feat: add runtime config metrics with periodic polling

keivenchang · keivenchang · commit 83cb3feaee82 · 2025-09-18T03:43:36.000Z
- Add new Prometheus metrics for model runtime configuration
- Include metrics for total_kv_blocks, max_num_seqs, max_num_batched_tokens
- Add MDC metrics for context_length, kv_cache_block_size, migration_limit
- Implement model health status tracking (healthy/unhealthy/unknown)
- Add background polling task to keep metrics current as backends change
- Preserve all historical data - metrics are never removed, only marked unhealthy
- Configurable poll interval via DYN_RUNTIME_CONFIG_METRICS_POLL_INTERVAL_SECS

Signed-off-by: Keiven Chang &lt;keivenchang@users.noreply.github.com&gt;
diff --git a/lib/llm/src/http/service/metrics.rs b/lib/llm/src/http/service/metrics.rs
@@ -18,6 +18,9 @@ use std::{
     time::{Duration, Instant},
 };
 
+use crate::discovery::ModelEntry;
+use crate::local_model::runtime_config::ModelRuntimeConfig;
+
 pub use prometheus::Registry;
 
 use super::RouteDoc;
@@ -32,6 +35,15 @@ pub struct Metrics {
     output_sequence_length: HistogramVec,
     time_to_first_token: HistogramVec,
     inter_token_latency: HistogramVec,
+
+    // Runtime configuration metrics
+    model_total_kv_blocks: IntGaugeVec,
+    model_max_num_seqs: IntGaugeVec,
+    model_max_num_batched_tokens: IntGaugeVec,
+    model_context_length: IntGaugeVec,
+    model_kv_cache_block_size: IntGaugeVec,
+    model_migration_limit: IntGaugeVec,
+    model_health_status: IntGaugeVec,
 }
 
 // Inflight tracks requests from HTTP handler start until complete response is finished.
@@ -123,6 +135,21 @@ impl Metrics {
     /// - `{prefix}_output_sequence_tokens` - HistogramVec for output sequence length in tokens
     /// - `{prefix}_time_to_first_token_seconds` - HistogramVec for time to first token in seconds
     /// - `{prefix}_inter_token_latency_seconds` - HistogramVec for inter-token latency in seconds
+    /// - `{prefix}_model_total_kv_blocks` - IntGaugeVec for total KV cache blocks available per model
+    /// - `{prefix}_model_max_num_seqs` - IntGaugeVec for maximum sequences per model
+    /// - `{prefix}_model_max_num_batched_tokens` - IntGaugeVec for maximum batched tokens per model
+    /// - `{prefix}_model_context_length` - IntGaugeVec for maximum context length per model
+    /// - `{prefix}_model_kv_cache_block_size` - IntGaugeVec for KV cache block size per model
+    /// - `{prefix}_model_migration_limit` - IntGaugeVec for request migration limit per model
+    /// - `{prefix}_model_health_status` - IntGaugeVec for model health status (1=healthy, 0=unhealthy, -1=unknown)
+    ///
+    /// ## Runtime Config Polling Configuration
+    ///
+    /// The polling behavior can be configured via environment variables:
+    /// - `DYN_RUNTIME_CONFIG_METRICS_POLL_INTERVAL_SECS`: Poll interval in seconds (default: 30)
+    ///
+    /// Metrics are never removed to preserve historical data. Models are marked as
+    /// healthy (1) or unhealthy (0) based on their current availability.
     pub fn new() -> Self {
         let raw_prefix = std::env::var(frontend_service::METRICS_PREFIX_ENV)
             .unwrap_or_else(|_| name_prefix::FRONTEND.to_string());
@@ -232,6 +259,70 @@ impl Metrics {
         )
         .unwrap();
 
+        // Runtime configuration metrics
+        let model_total_kv_blocks = IntGaugeVec::new(
+            Opts::new(
+                frontend_metric_name("model_total_kv_blocks"),
+                "Total KV cache blocks available for the model",
+            ),
+            &["model"],
+        )
+        .unwrap();
+
+        let model_max_num_seqs = IntGaugeVec::new(
+            Opts::new(
+                frontend_metric_name("model_max_num_seqs"),
+                "Maximum number of sequences the model can handle",
+            ),
+            &["model"],
+        )
+        .unwrap();
+
+        let model_max_num_batched_tokens = IntGaugeVec::new(
+            Opts::new(
+                frontend_metric_name("model_max_num_batched_tokens"),
+                "Maximum number of batched tokens for the model",
+            ),
+            &["model"],
+        )
+        .unwrap();
+
+        let model_context_length = IntGaugeVec::new(
+            Opts::new(
+                frontend_metric_name("model_context_length"),
+                "Maximum context length in tokens for the model",
+            ),
+            &["model"],
+        )
+        .unwrap();
+
+        let model_kv_cache_block_size = IntGaugeVec::new(
+            Opts::new(
+                frontend_metric_name("model_kv_cache_block_size"),
+                "KV cache block size in tokens for the model",
+            ),
+            &["model"],
+        )
+        .unwrap();
+
+        let model_migration_limit = IntGaugeVec::new(
+            Opts::new(
+                frontend_metric_name("model_migration_limit"),
+                "Maximum number of request migrations allowed for the model",
+            ),
+            &["model"],
+        )
+        .unwrap();
+
+        let model_health_status = IntGaugeVec::new(
+            Opts::new(
+                frontend_metric_name("model_health_status"),
+                "Health status of the model (1=healthy, 0=unhealthy, -1=unknown)",
+            ),
+            &["model"],
+        )
+        .unwrap();
+
         Metrics {
             request_counter,
             inflight_gauge,
@@ -242,6 +333,13 @@ impl Metrics {
             output_sequence_length,
             time_to_first_token,
             inter_token_latency,
+            model_total_kv_blocks,
+            model_max_num_seqs,
+            model_max_num_batched_tokens,
+            model_context_length,
+            model_kv_cache_block_size,
+            model_migration_limit,
+            model_health_status,
         }
     }
 
@@ -330,9 +428,200 @@ impl Metrics {
         registry.register(Box::new(self.output_sequence_length.clone()))?;
         registry.register(Box::new(self.time_to_first_token.clone()))?;
         registry.register(Box::new(self.inter_token_latency.clone()))?;
+
+        // Register runtime configuration metrics
+        registry.register(Box::new(self.model_total_kv_blocks.clone()))?;
+        registry.register(Box::new(self.model_max_num_seqs.clone()))?;
+        registry.register(Box::new(self.model_max_num_batched_tokens.clone()))?;
+        registry.register(Box::new(self.model_context_length.clone()))?;
+        registry.register(Box::new(self.model_kv_cache_block_size.clone()))?;
+        registry.register(Box::new(self.model_migration_limit.clone()))?;
+        registry.register(Box::new(self.model_health_status.clone()))?;
+
         Ok(())
     }
 
+    /// Update runtime configuration metrics for a model
+    /// This should be called when model runtime configuration is available or updated
+    pub fn update_runtime_config_metrics(
+        &self,
+        model_name: &str,
+        runtime_config: &ModelRuntimeConfig,
+    ) {
+        if let Some(total_kv_blocks) = runtime_config.total_kv_blocks {
+            self.model_total_kv_blocks
+                .with_label_values(&[model_name])
+                .set(total_kv_blocks as i64);
+        }
+
+        if let Some(max_num_seqs) = runtime_config.max_num_seqs {
+            self.model_max_num_seqs
+                .with_label_values(&[model_name])
+                .set(max_num_seqs as i64);
+        }
+
+        if let Some(max_batched_tokens) = runtime_config.max_num_batched_tokens {
+            self.model_max_num_batched_tokens
+                .with_label_values(&[model_name])
+                .set(max_batched_tokens as i64);
+        }
+    }
+
+    /// Update model deployment card metrics for a model
+    /// This should be called when model deployment card information is available
+    pub fn update_mdc_metrics(
+        &self,
+        model_name: &str,
+        context_length: u32,
+        kv_cache_block_size: u32,
+        migration_limit: u32,
+    ) {
+        self.model_context_length
+            .with_label_values(&[model_name])
+            .set(context_length as i64);
+
+        self.model_kv_cache_block_size
+            .with_label_values(&[model_name])
+            .set(kv_cache_block_size as i64);
+
+        self.model_migration_limit
+            .with_label_values(&[model_name])
+            .set(migration_limit as i64);
+    }
+
+    /// Set model health status
+    /// 1 = healthy (model is active and responding)
+    /// 0 = unhealthy (model is known but not responding)
+    /// -1 = unknown (model status is unclear)
+    pub fn set_model_health_status(&self, model_name: &str, healthy: bool) {
+        let status = if healthy { 1 } else { 0 };
+        self.model_health_status
+            .with_label_values(&[model_name])
+            .set(status);
+    }
+
+    /// Mark model as having unknown health status
+    pub fn set_model_health_unknown(&self, model_name: &str) {
+        self.model_health_status
+            .with_label_values(&[model_name])
+            .set(-1);
+    }
+
+    /// Update metrics from a ModelEntry
+    /// This is a convenience method that extracts runtime config from a ModelEntry
+    /// and updates the appropriate metrics
+    pub fn update_metrics_from_model_entry(&self, model_entry: &ModelEntry) {
+        if let Some(runtime_config) = &model_entry.runtime_config {
+            self.update_runtime_config_metrics(&model_entry.name, runtime_config);
+        }
+    }
+
+    /// Update metrics from a ModelEntry and its ModelDeploymentCard
+    /// This updates both runtime config metrics and MDC-specific metrics
+    pub async fn update_metrics_from_model_entry_with_mdc(
+        &self,
+        model_entry: &ModelEntry,
+        etcd_client: &dynamo_runtime::transports::etcd::Client,
+    ) -> anyhow::Result<()> {
+        // Update runtime config metrics
+        if let Some(runtime_config) = &model_entry.runtime_config {
+            self.update_runtime_config_metrics(&model_entry.name, runtime_config);
+        }
+
+        // Load and update MDC metrics
+        match model_entry.load_mdc(etcd_client).await {
+            Ok(mdc) => {
+                self.update_mdc_metrics(
+                    &model_entry.name,
+                    mdc.context_length,
+                    mdc.kv_cache_block_size,
+                    mdc.migration_limit,
+                );
+                Ok(())
+            }
+            Err(e) => {
+                tracing::warn!(
+                    model = %model_entry.name,
+                    error = %e,
+                    "Failed to load ModelDeploymentCard for metrics update"
+                );
+                Err(e)
+            }
+        }
+    }
+
+    /// Start a background task that periodically updates runtime config metrics
+    /// This polls the ModelManager for current models and updates metrics accordingly
+    /// Models are never removed - only marked as healthy/unhealthy to preserve historical data
+    pub fn start_runtime_config_polling_task(
+        metrics: Arc<Self>,
+        manager: Arc<crate::discovery::ModelManager>,
+        etcd_client: Option<dynamo_runtime::transports::etcd::Client>,
+        poll_interval: Duration,
+    ) -> tokio::task::JoinHandle<()> {
+        tokio::spawn(async move {
+            let mut interval = tokio::time::interval(poll_interval);
+            let mut known_models = std::collections::HashSet::new();
+
+            tracing::info!(
+                interval_secs = poll_interval.as_secs(),
+                "Starting runtime config metrics polling task (metrics never removed)"
+            );
+
+            loop {
+                interval.tick().await;
+
+                // Get current model entries from the manager
+                let current_entries = manager.get_model_entries();
+                let mut current_models = std::collections::HashSet::new();
+
+                // Update metrics for current models
+                for entry in current_entries {
+                    current_models.insert(entry.name.clone());
+
+                    // Mark model as healthy since it's currently active
+                    metrics.set_model_health_status(&entry.name, true);
+
+                    // Update runtime config metrics if available
+                    if let Some(runtime_config) = &entry.runtime_config {
+                        metrics.update_runtime_config_metrics(&entry.name, runtime_config);
+                    }
+
+                    // Optionally load MDC for additional metrics if etcd is available
+                    if let Some(ref etcd) = etcd_client
+                        && let Err(e) = metrics
+                            .update_metrics_from_model_entry_with_mdc(&entry, etcd)
+                            .await
+                    {
+                        tracing::debug!(
+                            model = %entry.name,
+                            error = %e,
+                            "Failed to update MDC metrics (this is normal if MDC is not available)"
+                        );
+                    }
+                }
+
+                // Mark models that are no longer active as unhealthy (but keep their metrics)
+                for model_name in known_models.difference(&current_models) {
+                    metrics.set_model_health_status(model_name, false);
+                    tracing::debug!(
+                        model = %model_name,
+                        "Model no longer active, marked as unhealthy (metrics preserved)"
+                    );
+                }
+
+                // Update our known models set
+                known_models.extend(current_models.iter().cloned());
+
+                tracing::trace!(
+                    active_models = current_models.len(),
+                    total_known_models = known_models.len(),
+                    "Updated runtime config metrics for active models"
+                );
+            }
+        })
+    }
+
     /// Create a new [`InflightGuard`] for the given model and annotate if its a streaming request,
     /// and the kind of endpoint that was hit
     ///
diff --git a/lib/llm/src/http/service/service_v2.rs b/lib/llm/src/http/service/service_v2.rs
@@ -294,6 +294,7 @@ impl HttpServiceConfigBuilder {
         let config: HttpServiceConfig = self.build_internal()?;
 
         let model_manager = Arc::new(ModelManager::new());
+        let etcd_client = config.etcd_client.clone();
         let state = Arc::new(State::new_with_etcd(model_manager, config.etcd_client));
 
         state
@@ -313,6 +314,24 @@ impl HttpServiceConfigBuilder {
         let registry = metrics::Registry::new();
         state.metrics_clone().register(&registry)?;
 
+        // Start background task to poll runtime config metrics
+        // Poll every 30 seconds to keep metrics current as backends come and go
+        // Metrics are never removed - only marked as healthy/unhealthy
+        let poll_interval = Duration::from_secs(
+            std::env::var("DYN_RUNTIME_CONFIG_METRICS_POLL_INTERVAL_SECS")
+                .ok()
+                .and_then(|s| s.parse().ok())
+                .unwrap_or(30),
+        );
+
+        let _polling_task = super::metrics::Metrics::start_runtime_config_polling_task(
+            state.metrics_clone(),
+            state.manager_clone(),
+            etcd_client,
+            poll_interval,
+        );
+        // Note: We don't need to store the JoinHandle as the task will run for the lifetime of the service
+
         let mut router = axum::Router::new();
 
         let mut all_docs = Vec::new();