
Commit 880838b

keivenchang and nv-anants authored and committed
refactor: move uptime tracking from system_status_server(HTTP) to DRT level (#2587)
Co-authored-by: Keiven Chang <keivenchang@users.noreply.github.com>
1 parent d9d384e · commit 880838b


4 files changed: +165 −116 lines changed


lib/runtime/src/distributed.rs

Lines changed: 67 additions & 12 deletions
@@ -117,6 +117,13 @@ impl DistributedRuntime {
         });
         distributed_runtime.register_metrics_callback(drt_hierarchies, nats_client_callback);
 
+        // Initialize the uptime gauge in SystemHealth
+        distributed_runtime
+            .system_health
+            .lock()
+            .unwrap()
+            .initialize_uptime_gauge(&distributed_runtime)?;
+
         // Handle system status server initialization
         if let Some(cancel_token) = cancel_token {
             // System server is enabled - start both the state and HTTP server
@@ -153,17 +160,7 @@ impl DistributedRuntime {
                }
            }
        } else {
-            // System server HTTP is disabled, but still create the state for metrics
-            // This ensures uptime_seconds metric is always registered
-            let system_status_state = crate::system_status_server::SystemStatusState::new(
-                Arc::new(distributed_runtime.clone()),
-            )?;
-
-            // Initialize the start time for uptime tracking
-            if let Err(e) = system_status_state.initialize_start_time() {
-                tracing::warn!("Failed to initialize system status start time: {}", e);
-            }
+            // System server HTTP is disabled, but uptime metrics are still being tracked via SystemHealth
            tracing::debug!(
                "System status server HTTP endpoints disabled, but uptime metrics are being tracked"
            );
@@ -349,7 +346,7 @@ impl DistributedConfig {
 }
 
 #[cfg(test)]
-pub mod test_helpers {
+pub mod distributed_test_utils {
     //! Common test helper functions for DistributedRuntime tests
     // TODO: Use in-memory DistributedRuntime for tests instead of full runtime when available.
@@ -364,3 +361,61 @@ pub mod test_helpers {
         .unwrap()
     }
 }
+
+#[cfg(feature = "integration")]
+#[cfg(test)]
+mod tests {
+    use super::distributed_test_utils::create_test_drt_async;
+
+    #[tokio::test]
+    async fn test_drt_uptime_after_delay_system_disabled() {
+        // Test uptime with system status server disabled
+        temp_env::async_with_vars([("DYN_SYSTEM_ENABLED", Some("false"))], async {
+            // Start a DRT
+            let drt = create_test_drt_async().await;
+
+            // Wait 50ms
+            tokio::time::sleep(tokio::time::Duration::from_millis(50)).await;
+
+            // Check that uptime is 50+ ms
+            let uptime = drt.system_health.lock().unwrap().uptime();
+            assert!(
+                uptime >= std::time::Duration::from_millis(50),
+                "Expected uptime to be at least 50ms, but got {:?}",
+                uptime
+            );
+
+            println!(
+                "✓ DRT uptime test passed (system disabled): uptime = {:?}",
+                uptime
+            );
+        })
+        .await;
+    }
+
+    #[tokio::test]
+    async fn test_drt_uptime_after_delay_system_enabled() {
+        // Test uptime with system status server enabled
+        temp_env::async_with_vars([("DYN_SYSTEM_ENABLED", Some("true"))], async {
+            // Start a DRT
+            let drt = create_test_drt_async().await;
+
+            // Wait 50ms
+            tokio::time::sleep(tokio::time::Duration::from_millis(50)).await;
+
+            // Check that uptime is 50+ ms
+            let uptime = drt.system_health.lock().unwrap().uptime();
+            assert!(
+                uptime >= std::time::Duration::from_millis(50),
+                "Expected uptime to be at least 50ms, but got {:?}",
+                uptime
+            );
+
+            println!(
+                "✓ DRT uptime test passed (system enabled): uptime = {:?}",
+                uptime
+            );
+        })
+        .await;
+    }
+}
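
Note on the new tests: temp_env::async_with_vars sets DYN_SYSTEM_ENABLED only for the duration of the wrapped future and restores the previous value afterwards, so the two tests cannot leak environment state into each other. A minimal standalone sketch of that pattern (illustrative only; it assumes the temp_env crate with its async feature plus tokio, and is not code from this commit):

// Sketch: env-scoped async test, mirroring the pattern used above.
#[tokio::test]
async fn sketch_env_scoped_async_test() {
    temp_env::async_with_vars([("DYN_SYSTEM_ENABLED", Some("false"))], async {
        // The variable is visible for the duration of this future...
        assert_eq!(std::env::var("DYN_SYSTEM_ENABLED").as_deref(), Ok("false"));
    })
    .await;
    // ...and is restored to its prior state once the future completes.
}

Since the module is gated on #[cfg(feature = "integration")], these tests only compile and run when that feature is enabled, e.g. via something like cargo test --features integration.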

lib/runtime/src/lib.rs

Lines changed: 33 additions & 0 deletions
@@ -21,6 +21,7 @@
 use std::{
     collections::HashMap,
     sync::{Arc, OnceLock, Weak},
+    time::Instant,
 };
 use tokio::sync::Mutex;
 
@@ -90,6 +91,8 @@ pub struct SystemHealth {
     use_endpoint_health_status: Vec<String>,
     health_path: String,
     live_path: String,
+    start_time: Instant,
+    uptime_gauge: OnceLock<prometheus::Gauge>,
 }
 
 impl SystemHealth {
@@ -109,6 +112,8 @@ impl SystemHealth {
            use_endpoint_health_status,
            health_path,
            live_path,
+            start_time: Instant::now(),
+            uptime_gauge: OnceLock::new(),
        }
    }
    pub fn set_health_status(&mut self, status: HealthStatus) {
@@ -145,6 +150,34 @@ impl SystemHealth {
 
        (healthy, endpoints)
    }
+
+    /// Initialize the uptime gauge using the provided metrics registry
+    pub fn initialize_uptime_gauge<T: crate::metrics::MetricsRegistry>(
+        &self,
+        registry: &T,
+    ) -> anyhow::Result<()> {
+        let gauge = registry.create_gauge(
+            "uptime_seconds",
+            "Total uptime of the DistributedRuntime in seconds",
+            &[],
+        )?;
+        self.uptime_gauge
+            .set(gauge)
+            .map_err(|_| anyhow::anyhow!("uptime_gauge already initialized"))?;
+        Ok(())
+    }
+
+    /// Get the current uptime as a Duration
+    pub fn uptime(&self) -> std::time::Duration {
+        self.start_time.elapsed()
+    }
+
+    /// Update the uptime gauge with the current uptime value
+    pub fn update_uptime_gauge(&self) {
+        if let Some(gauge) = self.uptime_gauge.get() {
+            gauge.set(self.uptime().as_secs_f64());
+        }
+    }
 }
 
 /// Type alias for runtime callback functions to reduce complexity
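
The hunk above registers the gauge and defines its update path, but the call site for update_uptime_gauge is not part of this diff (presumably it runs when metrics are scraped). The init-once behavior comes from OnceLock: set succeeds exactly once, and get returns None until initialization, so early updates are silently skipped. Below is a self-contained sketch of the same pattern, using prometheus::Gauge::new and a plain prometheus::Registry in place of the crate's MetricsRegistry::create_gauge helper; the UptimeSketch type and its method names are hypothetical, not part of this commit:

use std::sync::OnceLock;
use std::time::Instant;

// Standalone sketch of the start_time + OnceLock<Gauge> pattern above.
struct UptimeSketch {
    start_time: Instant,
    uptime_gauge: OnceLock<prometheus::Gauge>,
}

impl UptimeSketch {
    fn new() -> Self {
        Self {
            start_time: Instant::now(),
            uptime_gauge: OnceLock::new(),
        }
    }

    /// Create, register, and store the gauge. OnceLock::set makes a second
    /// initialization attempt an error rather than a silent overwrite.
    fn initialize(&self, registry: &prometheus::Registry) -> anyhow::Result<()> {
        let gauge = prometheus::Gauge::new("uptime_seconds", "Uptime in seconds")?;
        registry.register(Box::new(gauge.clone()))?;
        self.uptime_gauge
            .set(gauge)
            .map_err(|_| anyhow::anyhow!("uptime gauge already initialized"))?;
        Ok(())
    }

    /// Safe to call at any time; a no-op until `initialize` has run.
    fn update(&self) {
        if let Some(gauge) = self.uptime_gauge.get() {
            gauge.set(self.start_time.elapsed().as_secs_f64());
        }
    }
}

fn main() -> anyhow::Result<()> {
    let registry = prometheus::Registry::new();
    let uptime = UptimeSketch::new();
    uptime.update(); // no-op: the gauge is not registered yet
    uptime.initialize(&registry)?;
    uptime.update(); // sets uptime_seconds to the elapsed seconds
    Ok(())
}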

lib/runtime/src/metrics.rs

Lines changed: 3 additions & 3 deletions
@@ -913,7 +913,7 @@ mod test_metricsregistry_units {
 #[cfg(test)]
 mod test_metricsregistry_prefixes {
     use super::*;
-    use crate::distributed::test_helpers::create_test_drt_async;
+    use crate::distributed::distributed_test_utils::create_test_drt_async;
     use prometheus::core::Collector;
 
     #[tokio::test]
@@ -1047,7 +1047,7 @@ mod test_metricsregistry_prometheus_fmt_outputs {
     use super::prometheus_names::{COMPONENT_NATS_METRICS, DRT_NATS_METRICS};
     use super::prometheus_names::{nats_client, nats_service};
     use super::*;
-    use crate::distributed::test_helpers::create_test_drt_async;
+    use crate::distributed::distributed_test_utils::create_test_drt_async;
     use prometheus::Counter;
     use std::sync::Arc;
 
@@ -1308,7 +1308,7 @@ mod test_metricsregistry_nats {
     use super::prometheus_names::{COMPONENT_NATS_METRICS, DRT_NATS_METRICS};
     use super::prometheus_names::{nats_client, nats_service};
     use super::*;
-    use crate::distributed::test_helpers::create_test_drt_async;
+    use crate::distributed::distributed_test_utils::create_test_drt_async;
     use crate::pipeline::PushRouter;
     use crate::{DistributedRuntime, Runtime};
     use tokio::time::{Duration, sleep};
