Skip to content

Commit

Permalink
Merge pull request #398 from atoma-network/ja-node-metrics
Browse files Browse the repository at this point in the history
feat: add more detailed node metrics for p2p broadcasting
  • Loading branch information
maschad authored Feb 10, 2025
2 parents d2f3d42 + e321cea commit e3eed5c
Show file tree
Hide file tree
Showing 9 changed files with 570 additions and 183 deletions.
197 changes: 162 additions & 35 deletions Cargo.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ lazy_static = "1.5.0"
libp2p = "0.55.0"
metrics = "0.23"
metrics-exporter-prometheus = "0.14.0"
nvml-wrapper = "0.10.0"
once_cell = "1.20.3"
prometheus = "0.13.4"
rand = "0.8.5"
Expand All @@ -62,6 +63,7 @@ sha2 = "0.10.8"
sqlx = "0.8.2"
sui-keys = { git = "https://github.com/mystenlabs/sui", package = "sui-keys", tag = "testnet-v1.42.1" }
sui-sdk = { git = "https://github.com/mystenlabs/sui", package = "sui-sdk", tag = "testnet-v1.42.1" }
sysinfo = "0.33.1"
tdx = { git = "https://github.com/automata-network/tdx-attestation-sdk.git", branch = "main" }
tempfile = "3.16.0"
thiserror = "2.0.11"
Expand Down
2 changes: 2 additions & 0 deletions atoma-p2p/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,11 @@ libp2p = { workspace = true, features = [
fastcrypto = { workspace = true }
flume = { workspace = true }
futures = { workspace = true }
nvml-wrapper = { workspace = true }
serde = { workspace = true, features = ["derive"] }
sui-keys = { workspace = true }
sui-sdk = { workspace = true }
sysinfo = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true, features = ["full"] }
tracing = { workspace = true }
Expand Down
1 change: 1 addition & 0 deletions atoma-p2p/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
pub mod config;
pub mod metrics;
pub mod service;
pub mod timer;
pub mod types;
Expand Down
118 changes: 118 additions & 0 deletions atoma-p2p/src/metrics.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
use nvml_wrapper::{
enum_wrappers::device::TemperatureSensor,
struct_wrappers::device::{MemoryInfo, Utilization},
Nvml,
};
use serde::{Deserialize, Serialize};
use sysinfo::{Networks, System};
use thiserror::Error;
use tracing::instrument;

/// Structure to store the usage metrics for the node
///
/// This data is collected from the system and the GPU
/// to be sent across the p2p network, for efficient request routing.
#[derive(Debug, Serialize, Deserialize, Default)]
pub struct NodeMetrics {
    /// The CPU usage of the node, as reported by `sysinfo`'s global CPU
    /// usage (aggregate percentage across all cores)
    pub cpu_usage: f32,
    /// The amount of RAM used, in bytes (sysinfo reports bytes)
    pub ram_used: u64,
    /// The total amount of RAM in the system, in bytes
    pub ram_total: u64,
    /// The amount of swap memory in use, in bytes
    pub ram_swap_used: u64,
    /// The total amount of swap memory in the system, in bytes
    pub ram_swap_total: u64,
    /// The number of logical CPUs visible to the system
    pub num_cpus: u32,
    /// Bytes received from the network, summed over all interfaces
    pub network_rx: u64,
    /// Bytes transmitted to the network, summed over all interfaces
    pub network_tx: u64,
    /// The number of NVIDIA GPUs visible via NVML
    pub num_gpus: u32,
    /// The usage metrics for each GPU, indexed by NVML device index
    pub gpus: Vec<GpuMetrics>,
}

/// Structure to store the usage metrics for each GPU
///
/// Values come straight from NVML queries (`utilization_rates`,
/// `memory_info`, `temperature`, `power_usage`) in `compute_usage_metrics`.
#[derive(Debug, Serialize, Deserialize, Default)]
pub struct GpuMetrics {
    /// The amount of GPU memory in use, in bytes
    pub memory_used: u64,
    /// The total amount of memory on the GPU, in bytes
    pub memory_total: u64,
    /// The amount of free memory on the GPU, in bytes
    pub memory_free: u64,
    /// The percentage of time (0-100, over NVML's sample period) the GPU's
    /// memory was being read or written
    pub percentage_time_read_write: u32,
    /// The percentage of time (0-100, over NVML's sample period) one or more
    /// kernels were executing on the GPU
    pub percentage_time_gpu_execution: u32,
    /// The temperature of the GPU sensor, in degrees Celsius
    pub temperature: u32,
    /// The power usage of the GPU — NVML reports this in milliwatts
    pub power_usage: u32,
}

/// Returns the usage metrics for the node
///
/// Collects per-GPU metrics via NVML first, then refreshes the provided
/// [`System`] to read CPU / RAM / swap figures, and finally sums network
/// traffic counters across all interfaces.
///
/// # Errors
///
/// * [`NodeMetricsError::NvmlError`] — NVML failed to initialize (e.g. no
///   NVIDIA driver present) or a per-device query failed.
/// * [`NodeMetricsError::TryFromIntError`] — the CPU count does not fit in
///   a `u32` (practically unreachable).
#[instrument(level = "info", target = "metrics")]
pub fn compute_usage_metrics(mut sys: System) -> Result<NodeMetrics, NodeMetricsError> {
    let nvml = Nvml::init()?;

    let device_count = nvml.device_count()?;
    let mut gpus = Vec::with_capacity(device_count as usize);
    for i in 0..device_count {
        let device = nvml.device_by_index(i)?;
        // NVML utilization rates are percent-of-time over the last sample
        // period: `memory` = time device memory was read/written, `gpu` =
        // time at least one kernel was executing.
        let Utilization { gpu, memory } = device.utilization_rates()?;
        let MemoryInfo { used, total, free } = device.memory_info()?;
        let temperature = device.temperature(TemperatureSensor::Gpu)?;
        let power_usage = device.power_usage()?;
        gpus.push(GpuMetrics {
            memory_used: used,
            memory_total: total,
            memory_free: free,
            percentage_time_read_write: memory,
            percentage_time_gpu_execution: gpu,
            temperature,
            power_usage,
        });
    }

    // Refresh the system information so we can get the latest metrics.
    // NOTE(review): sysinfo needs two refreshes separated by
    // `MINIMUM_CPU_UPDATE_INTERVAL` before `global_cpu_usage` is meaningful;
    // this assumes the caller's `sys` was refreshed at least once before
    // being passed in — confirm at the call site.
    sys.refresh_all();
    let cpu_usage = sys.global_cpu_usage();
    let ram_used = sys.used_memory();
    let ram_total = sys.total_memory();
    let ram_swap_used = sys.used_swap();
    let ram_swap_total = sys.total_swap();
    let num_cpus = sys.cpus().len();

    // Sum cumulative traffic over every interface. `total_received` /
    // `total_transmitted` report totals since the interface came up, which
    // is what the `network_rx` / `network_tx` field docs promise. The
    // previously used `received()` / `transmitted()` only return the delta
    // since the last refresh — effectively zero for a freshly created
    // `Networks` list.
    let networks = Networks::new_with_refreshed_list();
    let mut network_rx = 0;
    let mut network_tx = 0;
    for (_interface, data) in &networks {
        network_rx += data.total_received();
        network_tx += data.total_transmitted();
    }

    Ok(NodeMetrics {
        cpu_usage,
        ram_used,
        ram_total,
        ram_swap_used,
        ram_swap_total,
        num_cpus: u32::try_from(num_cpus)?,
        network_rx,
        network_tx,
        num_gpus: device_count,
        gpus,
    })
}

/// Errors that can occur while computing [`NodeMetrics`].
#[derive(Debug, Error)]
pub enum NodeMetricsError {
    /// Any NVML failure: driver/library initialization, device enumeration,
    /// or a per-device query (utilization, memory, temperature, power).
    #[error("Nvml error: {0}")]
    NvmlError(#[from] nvml_wrapper::error::NvmlError),
    /// The system's CPU count could not be narrowed from `usize` to `u32`.
    #[error("Failed to convert number of CPUs to u32: {0}")]
    TryFromIntError(#[from] std::num::TryFromIntError),
}
Loading

0 comments on commit e3eed5c

Please sign in to comment.