matter-labs · RomanBrodetski · Feb 26, 2024 · Feb 21, 2024 · Feb 21, 2024 · Feb 21, 2024
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/core/bin/external_node/src/config/mod.rs b/core/bin/external_node/src/config/mod.rs
@@ -142,6 +142,14 @@ pub struct OptionalENConfig {
     /// Enabled JSON RPC API namespaces.
     api_namespaces: Option<Vec<Namespace>>,
 
+    // Health checks
+    /// Time limit in milliseconds to mark a health check as slow and log the corresponding warning.
+    /// If not specified, the default value in the health check crate will be used.
+    healthcheck_slow_time_limit_ms: Option<u64>,
+    /// Time limit in milliseconds to abort a health check and return "not ready" status for the corresponding component.
+    /// If not specified, the default value in the health check crate will be used.
+    healthcheck_hard_time_limit_ms: Option<u64>,
+
     // Gas estimation config
     /// The factor by which to scale the gasLimit
     #[serde(default = "OptionalENConfig::default_estimate_gas_scale_factor")]
@@ -368,6 +376,16 @@ impl OptionalENConfig {
     pub fn max_response_body_size(&self) -> usize {
         self.max_response_body_size_mb * BYTES_IN_MEGABYTE
     }
+
+    pub fn healthcheck_slow_time_limit(&self) -> Option<Duration> {
+        self.healthcheck_slow_time_limit_ms
+            .map(Duration::from_millis)
+    }
+
+    pub fn healthcheck_hard_time_limit(&self) -> Option<Duration> {
+        self.healthcheck_hard_time_limit_ms
+            .map(Duration::from_millis)
+    }
 }
 
 /// This part of the external node config is required for its operation.

diff --git a/core/bin/external_node/src/main.rs b/core/bin/external_node/src/main.rs
@@ -505,7 +505,10 @@ async fn main() -> anyhow::Result<()> {
 
     let main_node_client = <dyn MainNodeClient>::json_rpc(&main_node_url)
         .context("Failed creating JSON-RPC client for main node")?;
-    let app_health = Arc::new(AppHealthCheck::default());
+    let app_health = Arc::new(AppHealthCheck::new(
+        config.optional.healthcheck_slow_time_limit(),
+        config.optional.healthcheck_hard_time_limit(),
+    ));
     app_health.insert_custom_component(Arc::new(MainNodeHealthCheck::from(
         main_node_client.clone(),
     )));

diff --git a/core/lib/config/src/configs/api.rs b/core/lib/config/src/configs/api.rs
@@ -206,12 +206,26 @@ impl Web3JsonRpcConfig {
 pub struct HealthCheckConfig {
     /// Port to which the REST server is listening.
     pub port: u16,
+    /// Time limit in milliseconds to mark a health check as slow and log the corresponding warning.
+    /// If not specified, the default value in the health check crate will be used.
+    pub slow_time_limit_ms: Option<u64>,
+    /// Time limit in milliseconds to abort a health check and return "not ready" status for the corresponding component.
+    /// If not specified, the default value in the health check crate will be used.
+    pub hard_time_limit_ms: Option<u64>,
 }
 
 impl HealthCheckConfig {
     pub fn bind_addr(&self) -> SocketAddr {
         SocketAddr::new("0.0.0.0".parse().unwrap(), self.port)
     }
+
+    pub fn slow_time_limit(&self) -> Option<Duration> {
+        self.slow_time_limit_ms.map(Duration::from_millis)
+    }
+
+    pub fn hard_time_limit(&self) -> Option<Duration> {
+        self.hard_time_limit_ms.map(Duration::from_millis)
+    }
 }
 
 #[derive(Debug, Deserialize, Clone, PartialEq)]

diff --git a/core/lib/config/src/testonly.rs b/core/lib/config/src/testonly.rs
@@ -209,7 +209,11 @@ impl RandomConfig for configs::api::Web3JsonRpcConfig {
 
 impl RandomConfig for configs::api::HealthCheckConfig {
     fn sample(g: &mut Gen<impl Rng>) -> Self {
-        Self { port: g.gen() }
+        Self {
+            port: g.gen(),
+            slow_time_limit_ms: g.gen(),
+            hard_time_limit_ms: g.gen(),
+        }
     }
 }
 

diff --git a/core/lib/env_config/src/api.rs b/core/lib/env_config/src/api.rs
@@ -97,7 +97,11 @@ mod tests {
                 pushgateway_url: "http://127.0.0.1:9091".into(),
                 push_interval_ms: Some(100),
             },
-            healthcheck: HealthCheckConfig { port: 8081 },
+            healthcheck: HealthCheckConfig {
+                port: 8081,
+                slow_time_limit_ms: Some(250),
+                hard_time_limit_ms: Some(2_000),
+            },
             merkle_tree: MerkleTreeApiConfig { port: 8082 },
         }
     }
@@ -136,6 +140,8 @@ mod tests {
             API_PROMETHEUS_PUSHGATEWAY_URL="http://127.0.0.1:9091"
             API_PROMETHEUS_PUSH_INTERVAL_MS=100
             API_HEALTHCHECK_PORT=8081
+            API_HEALTHCHECK_SLOW_TIME_LIMIT_MS=250
+            API_HEALTHCHECK_HARD_TIME_LIMIT_MS=2000
             API_MERKLE_TREE_PORT=8082
         "#;
         lock.set_env(config);

diff --git a/core/lib/health_check/Cargo.toml b/core/lib/health_check/Cargo.toml
@@ -10,6 +10,8 @@ keywords = ["blockchain", "zksync"]
 categories = ["cryptography"]
 
 [dependencies]
+vise = { git = "https://github.com/matter-labs/vise.git", version = "0.1.0", rev = "1c9cc500e92cf9ea052b230e114a6f9cce4fb2c1" }
+
 async-trait = "0.1"
 futures = "0.3"
 serde = { version = "1.0", features = ["derive"] }

diff --git a/core/lib/health_check/README.md b/core/lib/health_check/README.md
@@ -0,0 +1,122 @@
+# Health Monitoring
+
+Healthcheck infrastructure that lets node components signal their current health state. The health states of all
+components run by the node are aggregated and exposed via an HTTP `GET /health` endpoint bound to a dedicated
+healthcheck port, for both the main node and the external node. This endpoint can be used as a readiness probe for
+Kubernetes or in other automation.
+
+## Main concepts
+
+A **component** is a logically isolated part of a node that affects the node's ability to handle requests (aka node
+health). Components are supposed to run indefinitely until the node receives a stop signal.
+
+- Internal components correspond to one or more Tokio tasks. Examples of internal components are: JSON-RPC API server,
+  Merkle tree, consistency checker, reorg detector.
+- External components correspond to another process that the node communicates with. Examples of external components
+  are: Postgres connection pool, main node JSON-RPC (for the external node).
+
+Each component can report its health, which consists of two parts:
+
+- **Status**, e.g., "not ready", "ready", "shut down", "panicked"; see the crate code for a full list.
+- **Details**, a JSON value with the component-specific schema. E.g., Merkle tree reports its L1 batch "cursor" as a
+  part of this information.
+
+Health from all components is aggregated into **application health**, whose own status is computed as the worst of the
+component statuses. Application health is returned by the `/health` endpoint.
+
+## `/health` endpoint format
+
+`/health` returns the current application health encoded as a JSON object. The HTTP status of the response is 20x if
+the application is healthy, and 50x if it is not.
+
+> **Warning.** The schema of data returned by the `/health` endpoint is not stable at this point and can change without
+> notice. Use at your own risk.
+
+<details>
+<summary>Example of endpoint output for an external node:</summary>
+
+```json
+{
+  "status": "ready",
+  "components": {
+    "sync_state": {
+      "status": "ready",
+      "details": {
+        "is_synced": true,
+        "local_block": 91,
+        "main_node_block": 91
+      }
+    },
+    "connection_pool": {
+      "status": "ready",
+      "details": {
+        "max_size": 50,
+        "pool_size": 10
+      }
+    },
+    "tree": {
+      "status": "ready",
+      "details": {
+        "leaf_count": 12624,
+        "mode": "full",
+        "next_l1_batch_number": 26,
+        "root_hash": "0x54d537798f9ebd1b6463e3773c3549a389709987d559fdcd8d402a652a33fb68",
+        "stage": "main_loop"
+      }
+    },
+    "snapshot_recovery": {
+      "status": "ready",
+      "details": {
+        "factory_deps_recovered": true,
+        "snapshot_l1_batch": 24,
+        "snapshot_miniblock": 89,
+        "storage_logs_chunk_count": 10,
+        "storage_logs_chunks_left_to_process": 0,
+        "tokens_recovered": true
+      }
+    },
+    "consistency_checker": {
+      "status": "ready",
+      "details": {
+        "first_checked_batch": 25,
+        "last_checked_batch": 25
+      }
+    },
+    "ws_api": {
+      "status": "ready"
+    },
+    "prometheus_exporter": {
+      "status": "ready"
+    },
+    "reorg_detector": {
+      "status": "ready",
+      "details": {
+        "last_correct_l1_batch": 25,
+        "last_correct_miniblock": 91
+      }
+    },
+    "main_node_http_rpc": {
+      "status": "ready"
+    },
+    "batch_status_updater": {
+      "status": "ready",
+      "details": {
+        "last_committed_l1_batch": 25,
+        "last_executed_l1_batch": 25,
+        "last_proven_l1_batch": 25
+      }
+    },
+    "commitment_generator": {
+      "status": "ready",
+      "details": {
+        "l1_batch_number": 25
+      }
+    },
+    "http_api": {
+      "status": "ready"
+    }
+  }
+}
+```
+
+</details>