Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions components/frontend/src/dynamo/frontend/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,12 @@ def parse_args():
type=validate_model_path,
help="Path to model directory on disk (e.g., /tmp/model_cache/lama3.2_1B/)",
)
parser.add_argument(
"--metrics-prefix",
type=str,
default=None,
help="Prefix for Dynamo frontend metrics. If unset, uses DYN_METRICS_PREFIX env var or 'dynamo_frontend'.",
)

flags = parser.parse_args()

Expand All @@ -146,6 +152,12 @@ async def async_main():
flags = parse_args()
is_static = bool(flags.static_endpoint) # true if the string has a value

# Configure Dynamo frontend HTTP service metrics prefix
if flags.metrics_prefix is not None:
prefix = flags.metrics_prefix.strip()
if prefix:
os.environ["DYN_METRICS_PREFIX"] = flags.metrics_prefix

runtime = DistributedRuntime(asyncio.get_running_loop(), is_static)

if flags.router_mode == "kv":
Expand Down
1 change: 1 addition & 0 deletions lib/llm/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ proptest = "1.5.0"
reqwest = { workspace = true }
rstest = "0.18.2"
rstest_reuse = "0.7.0"
serial_test = "3"
tempfile = "3.17.1"
insta = { version = "1.41", features = [
"glob",
Expand Down
64 changes: 51 additions & 13 deletions lib/llm/src/http/service/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,12 @@ pub use prometheus::Registry;

use super::RouteDoc;

/// Metric prefix for all HTTP service metrics
// Default metric prefix
pub const FRONTEND_METRIC_PREFIX: &str = "dynamo_frontend";

// Environment variable that overrides the default metric prefix if provided
pub const METRICS_PREFIX_ENV: &str = "DYN_METRICS_PREFIX";

/// Value for the `status` label in the request counter for successful requests
pub const REQUEST_STATUS_SUCCESS: &str = "success";

Expand All @@ -27,9 +30,29 @@ pub const REQUEST_TYPE_STREAM: &str = "stream";
/// Partial value for the `type` label in the request counter for unary requests
pub const REQUEST_TYPE_UNARY: &str = "unary";

/// Helper function to construct metric names with the standard prefix
fn frontend_metric_name(suffix: &str) -> String {
format!("{}_{}", FRONTEND_METRIC_PREFIX, suffix)
fn sanitize_prometheus_prefix(raw: &str) -> String {
// Prometheus metric name pattern: [a-zA-Z_:][a-zA-Z0-9_:]*
let mut s: String = raw
.chars()
.map(|c| {
if c.is_ascii_alphanumeric() || c == '_' || c == ':' {
c
} else {
'_'
}
})
.collect();

if s.is_empty() {
return FRONTEND_METRIC_PREFIX.to_string();
}

let first = s.as_bytes()[0];
let valid_first = first.is_ascii_alphabetic() || first == b'_' || first == b':';
if !valid_first {
s.insert(0, '_');
}
s
}

pub struct Metrics {
Expand Down Expand Up @@ -107,16 +130,31 @@ impl Default for Metrics {
}

impl Metrics {
/// Create Metrics with the standard prefix defined by [`FRONTEND_METRIC_PREFIX`]
/// The following metrics will be created:
/// - `dynamo_frontend_requests_total` - IntCounterVec for the total number of requests processed
/// - `dynamo_frontend_inflight_requests` - IntGaugeVec for the number of inflight requests
/// - `dynamo_frontend_request_duration_seconds` - HistogramVec for the duration of requests
/// - `dynamo_frontend_input_sequence_tokens` - HistogramVec for input sequence length in tokens
/// - `dynamo_frontend_output_sequence_tokens` - HistogramVec for output sequence length in tokens
/// - `dynamo_frontend_time_to_first_token_seconds` - HistogramVec for time to first token in seconds
/// - `dynamo_frontend_inter_token_latency_seconds` - HistogramVec for inter-token latency in seconds
/// Create Metrics with the standard prefix defined by [`FRONTEND_METRIC_PREFIX`] or specify custom prefix via the following environment variable:
/// - `DYN_METRICS_PREFIX`: Override the default metrics prefix
///
/// The following metrics will be created with the configured prefix:
/// - `{prefix}_requests_total` - IntCounterVec for the total number of requests processed
/// - `{prefix}_inflight_requests` - IntGaugeVec for the number of inflight requests
/// - `{prefix}_request_duration_seconds` - HistogramVec for the duration of requests
/// - `{prefix}_input_sequence_tokens` - HistogramVec for input sequence length in tokens
/// - `{prefix}_output_sequence_tokens` - HistogramVec for output sequence length in tokens
/// - `{prefix}_time_to_first_token_seconds` - HistogramVec for time to first token in seconds
/// - `{prefix}_inter_token_latency_seconds` - HistogramVec for inter-token latency in seconds
pub fn new() -> Self {
let raw_prefix = std::env::var(METRICS_PREFIX_ENV)
.unwrap_or_else(|_| FRONTEND_METRIC_PREFIX.to_string());
let prefix = sanitize_prometheus_prefix(&raw_prefix);
if prefix != raw_prefix {
tracing::warn!(
raw=%raw_prefix,
sanitized=%prefix,
env=%METRICS_PREFIX_ENV,
"Sanitized HTTP metrics prefix"
);
}
let frontend_metric_name = |suffix: &str| format!("{}_{}", &prefix, suffix);

let request_counter = IntCounterVec::new(
Opts::new(
frontend_metric_name("requests_total"),
Expand Down
104 changes: 104 additions & 0 deletions lib/llm/tests/http_metrics.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

use dynamo_llm::http::service::metrics::{self, Endpoint};
use dynamo_llm::http::service::service_v2::HttpService;
use dynamo_runtime::CancellationToken;
use serial_test::serial;
use std::{env, time::Duration};

#[tokio::test]
#[serial]
async fn metrics_prefix_default_then_env_override() {
// Case 1: default prefix
env::remove_var(metrics::METRICS_PREFIX_ENV);
let svc1 = HttpService::builder().port(9101).build().unwrap();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see this looks like it's following examples in http-service.rs, and a bit sad that many tests are not using random available ports (port 0). For tests, it's best to use random available ports so that tests running in parallel, don't run into collisions with other existing services and/or tests. Something like this would be preferred:

async fn create_http_service_with_random_port() -> (HttpService, u16) {
    // Bind to port 0 to get a random available port
    let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
    let actual_port = listener.local_addr().unwrap().port();
    
    // Drop the listener since HttpService will create its own
    drop(listener);
    
    // Create service with the actual port
    let service = HttpService::builder().port(actual_port).build().unwrap();
    
    (service, actual_port)
}

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will use random ports in the future.

let token1 = CancellationToken::new();
let _h1 = svc1.spawn(token1.clone()).await;
wait_for_metrics_ready(9101).await;

// Populate labeled metrics
let s1 = svc1.state_clone();
{
let _g = s1.metrics_clone().create_inflight_guard(
"test-model",
Endpoint::ChatCompletions,
false,
);
}
let body1 = reqwest::get("http://localhost:9101/metrics")
.await
.unwrap()
.text()
.await
.unwrap();
assert!(body1.contains("dynamo_frontend_requests_total"));
token1.cancel();

// Case 2: env override to prefix
env::set_var(metrics::METRICS_PREFIX_ENV, "custom_prefix");
let svc2 = HttpService::builder().port(9102).build().unwrap();
let token2 = CancellationToken::new();
let _h2 = svc2.spawn(token2.clone()).await;
wait_for_metrics_ready(9102).await;

// Populate labeled metrics
let s2 = svc2.state_clone();
{
let _g =
s2.metrics_clone()
.create_inflight_guard("test-model", Endpoint::ChatCompletions, true);
}
// Single fetch and assert
let body2 = reqwest::get("http://localhost:9102/metrics")
.await
.unwrap()
.text()
.await
.unwrap();
assert!(body2.contains("custom_prefix_requests_total"));
assert!(!body2.contains("dynamo_frontend_requests_total"));
token2.cancel();

// Case 3: invalid env prefix is sanitized
env::set_var(metrics::METRICS_PREFIX_ENV, "nv-llm/http service");
let svc3 = HttpService::builder().port(9103).build().unwrap();
let token3 = CancellationToken::new();
let _h3 = svc3.spawn(token3.clone()).await;
wait_for_metrics_ready(9103).await;

let s3 = svc3.state_clone();
{
let _g =
s3.metrics_clone()
.create_inflight_guard("test-model", Endpoint::ChatCompletions, true);
}
let body3 = reqwest::get("http://localhost:9103/metrics")
.await
.unwrap()
.text()
.await
.unwrap();
assert!(body3.contains("nv_llm_http_service_requests_total"));
assert!(!body3.contains("dynamo_frontend_requests_total"));
token3.cancel();

// Cleanup env to avoid leaking state
env::remove_var(metrics::METRICS_PREFIX_ENV);
}

// Poll /metrics until ready or timeout
async fn wait_for_metrics_ready(port: u16) {
let url = format!("http://localhost:{}/metrics", port);
let start = tokio::time::Instant::now();
let timeout = Duration::from_secs(5);
loop {
if start.elapsed() > timeout {
panic!("Timed out waiting for metrics endpoint at {}", url);
}
match reqwest::get(&url).await {
Ok(resp) if resp.status().is_success() => break,
_ => tokio::time::sleep(Duration::from_millis(50)).await,
}
}
}
Loading