Skip to content

Commit

Permalink
Time-limit health checks and log them
Browse files Browse the repository at this point in the history
  • Loading branch information
slowli committed Feb 1, 2024
1 parent 00d7f98 commit f454764
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 14 deletions.
16 changes: 13 additions & 3 deletions core/lib/dal/src/healthcheck.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,18 @@ impl CheckHealth for ConnectionPoolHealthCheck {
async fn check_health(&self) -> Health {
// This check is rather feeble, plan to make reliable here:
// https://linear.app/matterlabs/issue/PLA-255/revamp-db-connection-health-check
self.connection_pool.access_storage().await.unwrap();
let details = ConnectionPoolHealthDetails::new(&self.connection_pool);
Health::from(HealthStatus::Ready).with_details(details)
match self.connection_pool.access_storage().await {
Ok(_) => {
let details = ConnectionPoolHealthDetails::new(&self.connection_pool);
Health::from(HealthStatus::Ready).with_details(details)
}
Err(err) => {
tracing::warn!("Failed acquiring DB connection for health check: {err:?}");
let details = serde_json::json!({
"error": format!("{err:?}"),
});
Health::from(HealthStatus::NotReady).with_details(details)
}
}
}
}
54 changes: 43 additions & 11 deletions core/lib/health_check/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
use std::{collections::HashMap, thread};
use std::{collections::HashMap, thread, time::Duration};

// Public re-export for other crates to be able to implement the interface.
pub use async_trait::async_trait;
use futures::{future, FutureExt};
use futures::future;
use serde::Serialize;
use tokio::sync::watch;

Expand Down Expand Up @@ -81,13 +81,9 @@ pub struct AppHealth {
impl AppHealth {
/// Aggregates health info from the provided checks.
pub async fn new<T: AsRef<dyn CheckHealth>>(health_checks: &[T]) -> Self {
let check_futures = health_checks.iter().map(|check| {
let check_name = check.as_ref().name();
check
.as_ref()
.check_health()
.map(move |health| (check_name, health))
});
let check_futures = health_checks
.iter()
.map(|check| Self::check_health_with_time_limit(check.as_ref()));
let components: HashMap<_, _> = future::join_all(check_futures).await.into_iter().collect();

let aggregated_status = components
Expand All @@ -97,7 +93,40 @@ impl AppHealth {
.unwrap_or(HealthStatus::Ready);
let inner = aggregated_status.into();

Self { inner, components }
let this = Self { inner, components };
if !this.inner.status.is_ready() {
// Only log non-ready application health so that logs are not spammed without a reason.
tracing::debug!("Aggregated application health: {this:?}");
}
this
}

async fn check_health_with_time_limit(check: &dyn CheckHealth) -> (&'static str, Health) {
const WARNING_TIME_LIMIT: Duration = Duration::from_secs(3);
/// Chosen to be lesser than a typical HTTP client timeout (~30s).
const HARD_TIME_LIMIT: Duration = Duration::from_secs(20);

let check_name = check.name();
let timeout_at = tokio::time::Instant::now() + HARD_TIME_LIMIT;
let mut check_future = check.check_health();
match tokio::time::timeout(WARNING_TIME_LIMIT, &mut check_future).await {
Ok(output) => return (check_name, output),
Err(_) => {
tracing::info!(
"Health check `{check_name}` takes >{WARNING_TIME_LIMIT:?} to complete"
);
}
}

match tokio::time::timeout_at(timeout_at, check_future).await {
Ok(output) => (check_name, output),
Err(_) => {
tracing::warn!(
"Health check `{check_name}` timed out, taking >{HARD_TIME_LIMIT:?} to complete; marking as not ready"
);
(check_name, HealthStatus::NotReady.into())
}
}
}

pub fn is_ready(&self) -> bool {
Expand Down Expand Up @@ -166,7 +195,10 @@ impl HealthUpdater {
pub fn update(&self, health: Health) -> bool {
let old_health = self.health_sender.send_replace(health.clone());
if old_health != health {
tracing::debug!("changed health from {:?} to {:?}", old_health, health);
tracing::debug!(
"Changed health of `{}` from {old_health:?} to {health:?}",
self.name
);
return true;
}
false
Expand Down

0 comments on commit f454764

Please sign in to comment.