Skip to content

Commit

Permalink
refactor(api): Add more components to healthcheck – follow-ups (#3337)
Browse files Browse the repository at this point in the history
## What ❔

Various minor follow-ups after
#3193:

- Rework app-level health details.
- Fix `execution_time` unit of measurement for the database health check
details.
- Rework the database health check: do not hold a DB connection all the
time; make it reactive.

## Why ❔

Makes the dependency graph lighter; simplifies maintenance.

## Checklist

- [x] PR title corresponds to the body of PR (we generate changelog
entries from PRs).
- [x] Documentation comments have been added / updated.
- [x] Code has been formatted via `zkstack dev fmt` and `zkstack dev
lint`.
  • Loading branch information
slowli authored Dec 5, 2024
1 parent cf458a0 commit d0078df
Show file tree
Hide file tree
Showing 17 changed files with 111 additions and 153 deletions.
15 changes: 1 addition & 14 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,6 @@ members = [
# Test infrastructure
"core/tests/loadnext",
"core/tests/vm-benchmark",
"core/lib/bin_metadata",
]
resolver = "2"

Expand Down
1 change: 0 additions & 1 deletion core/bin/external_node/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ zksync_health_check.workspace = true
zksync_web3_decl.workspace = true
zksync_types.workspace = true
zksync_block_reverter.workspace = true
zksync_shared_metrics.workspace = true
zksync_node_genesis.workspace = true
zksync_node_fee_model.workspace = true
zksync_node_db_pruner.workspace = true
Expand Down
3 changes: 0 additions & 3 deletions core/bin/external_node/src/metrics/framework.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ use zksync_node_framework::{
implementations::resources::pools::{MasterPool, PoolResource},
FromContext, IntoContext, StopReceiver, Task, TaskId, WiringError, WiringLayer,
};
use zksync_shared_metrics::{GIT_METRICS, RUST_METRICS};
use zksync_types::{L1ChainId, L2ChainId, SLChainId};

use super::EN_METRICS;
Expand Down Expand Up @@ -39,8 +38,6 @@ impl WiringLayer for ExternalNodeMetricsLayer {
}

async fn wire(self, input: Self::Input) -> Result<Self::Output, WiringError> {
RUST_METRICS.initialize();
GIT_METRICS.initialize();
EN_METRICS.observe_config(
self.l1_chain_id,
self.sl_chain_id,
Expand Down
18 changes: 0 additions & 18 deletions core/lib/bin_metadata/Cargo.toml

This file was deleted.

8 changes: 4 additions & 4 deletions core/lib/dal/src/system_dal.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use std::{collections::HashMap, time::Duration};

use chrono::DateTime;
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use zksync_db_connection::{connection::Connection, error::DalResult, instrument::InstrumentExt};

Expand All @@ -14,11 +14,11 @@ pub(crate) struct TableSize {
pub total_size: u64,
}

#[derive(Debug, Serialize, Deserialize)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatabaseMigration {
pub version: i64,
pub description: String,
pub installed_on: DateTime<chrono::Utc>,
pub installed_on: DateTime<Utc>,
pub success: bool,
pub checksum: String,
pub execution_time: Duration,
Expand Down Expand Up @@ -118,7 +118,7 @@ impl SystemDal<'_, '_> {
installed_on: row.installed_on,
success: row.success,
checksum: hex::encode(row.checksum),
execution_time: Duration::from_millis(u64::try_from(row.execution_time).unwrap_or(0)),
execution_time: Duration::from_nanos(u64::try_from(row.execution_time).unwrap_or(0)),
})
}
}
1 change: 0 additions & 1 deletion core/lib/health_check/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ serde_json.workspace = true
thiserror.workspace = true
tokio = { workspace = true, features = ["sync", "time"] }
tracing.workspace = true
zksync_bin_metadata.workspace = true

[dev-dependencies]
assert_matches.workspace = true
Expand Down
21 changes: 0 additions & 21 deletions core/lib/health_check/src/binary.rs

This file was deleted.

19 changes: 14 additions & 5 deletions core/lib/health_check/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,9 @@ pub use async_trait::async_trait;
use futures::future;
use serde::Serialize;
use tokio::sync::watch;
use zksync_bin_metadata::BIN_METADATA;

use self::metrics::{CheckResult, METRICS};
use crate::metrics::AppHealthCheckConfig;
use crate::metrics::{AppHealthCheckConfig, CheckResult, METRICS};

mod binary;
mod metrics;

#[cfg(test)]
Expand Down Expand Up @@ -114,6 +111,8 @@ pub struct AppHealthCheck {

#[derive(Debug, Clone)]
struct AppHealthCheckInner {
/// Application-level health details.
app_details: Option<serde_json::Value>,
components: Vec<Arc<dyn CheckHealth>>,
slow_time_limit: Duration,
hard_time_limit: Duration,
Expand All @@ -136,6 +135,7 @@ impl AppHealthCheck {

let inner = AppHealthCheckInner {
components: Vec::default(),
app_details: None,
slow_time_limit,
hard_time_limit,
};
Expand Down Expand Up @@ -181,6 +181,13 @@ impl AppHealthCheck {
}
}

/// Sets app-level health details. They can include build info etc.
pub fn set_details(&self, details: impl Serialize) {
let details = serde_json::to_value(details).expect("failed serializing app details");
let mut inner = self.inner.lock().expect("`AppHealthCheck` is poisoned");
inner.app_details = Some(details);
}

/// Inserts health check for a component.
///
/// # Errors
Expand Down Expand Up @@ -220,6 +227,7 @@ impl AppHealthCheck {
// Clone `inner` so that we don't hold a lock for them across a wait point.
let AppHealthCheckInner {
components,
app_details,
slow_time_limit,
hard_time_limit,
} = self
Expand All @@ -238,7 +246,8 @@ impl AppHealthCheck {
.map(|health| health.status)
.max_by_key(|status| status.priority_for_aggregation())
.unwrap_or(HealthStatus::Ready);
let inner = Health::with_details(aggregated_status.into(), BIN_METADATA);
let mut inner = Health::from(aggregated_status);
inner.details = app_details.clone();

let health = AppHealth { inner, components };
if !health.inner.status.is_healthy() {
Expand Down
1 change: 1 addition & 0 deletions core/lib/health_check/src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ async fn aggregating_health_checks() {
let (first_check, first_updater) = ReactiveHealthCheck::new("first");
let (second_check, second_updater) = ReactiveHealthCheck::new("second");
let inner = AppHealthCheckInner {
app_details: None,
components: vec![Arc::new(first_check), Arc::new(second_check)],
slow_time_limit: AppHealthCheck::DEFAULT_SLOW_TIME_LIMIT,
hard_time_limit: AppHealthCheck::DEFAULT_HARD_TIME_LIMIT,
Expand Down
1 change: 0 additions & 1 deletion core/node/node_framework/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ zksync_vm_executor.workspace = true
zksync_state_keeper.workspace = true
zksync_consistency_checker.workspace = true
zksync_metadata_calculator.workspace = true
zksync_bin_metadata.workspace = true
zksync_node_sync.workspace = true
zksync_node_api_server.workspace = true
zksync_node_consensus.workspace = true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ use std::sync::Arc;
use zksync_config::configs::api::HealthCheckConfig;
use zksync_health_check::AppHealthCheck;
use zksync_node_api_server::healthcheck::HealthCheckHandle;
use zksync_shared_metrics::metadata::{GitMetadata, RustMetadata, GIT_METRICS, RUST_METRICS};
use zksync_web3_decl::jsonrpsee::core::Serialize;

use crate::{
implementations::resources::healthcheck::AppHealthCheckResource,
Expand All @@ -12,6 +14,13 @@ use crate::{
FromContext, IntoContext,
};

/// Full metadata of the compiled binary.
#[derive(Debug, Serialize)]
pub struct BinMetadata {
pub rust: &'static RustMetadata,
pub git: &'static GitMetadata,
}

/// Wiring layer for health check server
///
/// Expects other layers to insert different components' health checks
Expand Down Expand Up @@ -73,8 +82,12 @@ impl Task for HealthCheckTask {
}

async fn run(mut self: Box<Self>, mut stop_receiver: StopReceiver) -> anyhow::Result<()> {
self.app_health_check.set_details(BinMetadata {
rust: RUST_METRICS.initialize(),
git: GIT_METRICS.initialize(),
});
let handle =
HealthCheckHandle::spawn_server(self.config.bind_addr(), self.app_health_check.clone());
HealthCheckHandle::spawn_server(self.config.bind_addr(), self.app_health_check);
stop_receiver.0.changed().await?;
handle.stop().await;

Expand Down
Loading

0 comments on commit d0078df

Please sign in to comment.