Skip to content

Commit

Permalink
Merge pull request #398 from atoma-network/ja-node-metrics
Browse files Browse the repository at this point in the history
feat: add more detailed node metrics for p2p broadcasting
  • Loading branch information
maschad authored Feb 10, 2025
2 parents d2f3d42 + e321cea commit e3eed5c
Show file tree
Hide file tree
Showing 9 changed files with 570 additions and 183 deletions.
197 changes: 162 additions & 35 deletions Cargo.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ lazy_static = "1.5.0"
libp2p = "0.55.0"
metrics = "0.23"
metrics-exporter-prometheus = "0.14.0"
nvml-wrapper = "0.10.0"
once_cell = "1.20.3"
prometheus = "0.13.4"
rand = "0.8.5"
Expand All @@ -62,6 +63,7 @@ sha2 = "0.10.8"
sqlx = "0.8.2"
sui-keys = { git = "https://github.com/mystenlabs/sui", package = "sui-keys", tag = "testnet-v1.42.1" }
sui-sdk = { git = "https://github.com/mystenlabs/sui", package = "sui-sdk", tag = "testnet-v1.42.1" }
sysinfo = "0.33.1"
tdx = { git = "https://github.com/automata-network/tdx-attestation-sdk.git", branch = "main" }
tempfile = "3.16.0"
thiserror = "2.0.11"
Expand Down
2 changes: 2 additions & 0 deletions atoma-p2p/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,11 @@ libp2p = { workspace = true, features = [
fastcrypto = { workspace = true }
flume = { workspace = true }
futures = { workspace = true }
nvml-wrapper = { workspace = true }
serde = { workspace = true, features = ["derive"] }
sui-keys = { workspace = true }
sui-sdk = { workspace = true }
sysinfo = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true, features = ["full"] }
tracing = { workspace = true }
Expand Down
1 change: 1 addition & 0 deletions atoma-p2p/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
pub mod config;
pub mod metrics;
pub mod service;
pub mod timer;
pub mod types;
Expand Down
118 changes: 118 additions & 0 deletions atoma-p2p/src/metrics.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
use nvml_wrapper::{
enum_wrappers::device::TemperatureSensor,
struct_wrappers::device::{MemoryInfo, Utilization},
Nvml,
};
use serde::{Deserialize, Serialize};
use sysinfo::{Networks, System};
use thiserror::Error;
use tracing::instrument;

/// Structure to store the usage metrics for the node
///
/// This data is collected from the system and the GPU
/// to be sent across the p2p network, for efficient request routing.
#[derive(Debug, Serialize, Deserialize, Default)]
pub struct NodeMetrics {
    /// The CPU usage of the node, as reported by `sysinfo`'s global CPU
    /// usage (aggregate percentage across all cores)
    pub cpu_usage: f32,
    /// The amount of RAM used, in bytes (sysinfo reports bytes)
    pub ram_used: u64,
    /// The total amount of RAM in the system, in bytes
    pub ram_total: u64,
    /// The amount of swap memory in use, in bytes
    pub ram_swap_used: u64,
    /// The total amount of swap memory in the system, in bytes
    pub ram_swap_total: u64,
    /// The number of logical CPUs visible to the system
    pub num_cpus: u32,
    /// Bytes received from the network, summed over all interfaces
    pub network_rx: u64,
    /// Bytes transmitted to the network, summed over all interfaces
    pub network_tx: u64,
    /// The number of NVIDIA GPUs visible via NVML
    pub num_gpus: u32,
    /// The usage metrics for each GPU, indexed by NVML device index
    pub gpus: Vec<GpuMetrics>,
}

/// Structure to store the usage metrics for each GPU
///
/// Values come straight from NVML queries (`utilization_rates`,
/// `memory_info`, `temperature`, `power_usage`) in `compute_usage_metrics`.
#[derive(Debug, Serialize, Deserialize, Default)]
pub struct GpuMetrics {
    /// The amount of GPU memory in use, in bytes
    pub memory_used: u64,
    /// The total amount of memory on the GPU, in bytes
    pub memory_total: u64,
    /// The amount of free memory on the GPU, in bytes
    pub memory_free: u64,
    /// The percentage of time (0-100, over NVML's sample period) the GPU's
    /// memory was being read or written
    pub percentage_time_read_write: u32,
    /// The percentage of time (0-100, over NVML's sample period) one or more
    /// kernels were executing on the GPU
    pub percentage_time_gpu_execution: u32,
    /// The temperature of the GPU sensor, in degrees Celsius
    pub temperature: u32,
    /// The power usage of the GPU — NVML reports this in milliwatts
    pub power_usage: u32,
}

/// Returns the usage metrics for the node
///
/// Collects per-GPU metrics via NVML first, then refreshes the provided
/// [`System`] to read CPU / RAM / swap figures, and finally sums network
/// traffic counters across all interfaces.
///
/// # Errors
///
/// * [`NodeMetricsError::NvmlError`] — NVML failed to initialize (e.g. no
///   NVIDIA driver present) or a per-device query failed.
/// * [`NodeMetricsError::TryFromIntError`] — the CPU count does not fit in
///   a `u32` (practically unreachable).
#[instrument(level = "info", target = "metrics")]
pub fn compute_usage_metrics(mut sys: System) -> Result<NodeMetrics, NodeMetricsError> {
    let nvml = Nvml::init()?;

    let device_count = nvml.device_count()?;
    let mut gpus = Vec::with_capacity(device_count as usize);
    for i in 0..device_count {
        let device = nvml.device_by_index(i)?;
        // NVML utilization rates are percent-of-time over the last sample
        // period: `memory` = time device memory was read/written, `gpu` =
        // time at least one kernel was executing.
        let Utilization { gpu, memory } = device.utilization_rates()?;
        let MemoryInfo { used, total, free } = device.memory_info()?;
        let temperature = device.temperature(TemperatureSensor::Gpu)?;
        let power_usage = device.power_usage()?;
        gpus.push(GpuMetrics {
            memory_used: used,
            memory_total: total,
            memory_free: free,
            percentage_time_read_write: memory,
            percentage_time_gpu_execution: gpu,
            temperature,
            power_usage,
        });
    }

    // Refresh the system information so we can get the latest metrics.
    // NOTE(review): sysinfo needs two refreshes separated by
    // `MINIMUM_CPU_UPDATE_INTERVAL` before `global_cpu_usage` is meaningful;
    // this assumes the caller's `sys` was refreshed at least once before
    // being passed in — confirm at the call site.
    sys.refresh_all();
    let cpu_usage = sys.global_cpu_usage();
    let ram_used = sys.used_memory();
    let ram_total = sys.total_memory();
    let ram_swap_used = sys.used_swap();
    let ram_swap_total = sys.total_swap();
    let num_cpus = sys.cpus().len();

    // Sum cumulative traffic over every interface. `total_received` /
    // `total_transmitted` report totals since the interface came up, which
    // is what the `network_rx` / `network_tx` field docs promise. The
    // previously used `received()` / `transmitted()` only return the delta
    // since the last refresh — effectively zero for a freshly created
    // `Networks` list.
    let networks = Networks::new_with_refreshed_list();
    let mut network_rx = 0;
    let mut network_tx = 0;
    for (_interface, data) in &networks {
        network_rx += data.total_received();
        network_tx += data.total_transmitted();
    }

    Ok(NodeMetrics {
        cpu_usage,
        ram_used,
        ram_total,
        ram_swap_used,
        ram_swap_total,
        num_cpus: u32::try_from(num_cpus)?,
        network_rx,
        network_tx,
        num_gpus: device_count,
        gpus,
    })
}

/// Errors that can occur while computing [`NodeMetrics`].
#[derive(Debug, Error)]
pub enum NodeMetricsError {
    /// Any NVML failure: driver/library initialization, device enumeration,
    /// or a per-device query (utilization, memory, temperature, power).
    #[error("Nvml error: {0}")]
    NvmlError(#[from] nvml_wrapper::error::NvmlError),
    /// The system's CPU count could not be narrowed from `usize` to `u32`.
    #[error("Failed to convert number of CPUs to u32: {0}")]
    TryFromIntError(#[from] std::num::TryFromIntError),
}
Loading

0 comments on commit e3eed5c

Please sign in to comment.