Skip to content

Commit 82120ba

Browse files
committed
fix: sanitize metric prefix and improve test reliability
1 parent d94e47c commit 82120ba

File tree

5 files changed

+147
-21
lines changed

5 files changed

+147
-21
lines changed

Cargo.lock

Lines changed: 41 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

components/frontend/src/dynamo/frontend/main.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,9 @@ async def async_main():
154154

155155
# Configure Dynamo frontend HTTP service metrics prefix
156156
if flags.metrics_prefix is not None:
157-
os.environ["DYN_METRICS_PREFIX"] = flags.metrics_prefix
157+
prefix = flags.metrics_prefix.strip()
158+
if prefix:
159+
os.environ["DYN_METRICS_PREFIX"] = flags.metrics_prefix
158160

159161
runtime = DistributedRuntime(asyncio.get_running_loop(), is_static)
160162

lib/llm/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,7 @@ proptest = "1.5.0"
143143
reqwest = { workspace = true }
144144
rstest = "0.18.2"
145145
rstest_reuse = "0.7.0"
146+
serial_test = "3"
146147
tempfile = "3.17.1"
147148
insta = { version = "1.41", features = [
148149
"glob",

lib/llm/src/http/service/metrics.rs

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,25 @@ pub const REQUEST_TYPE_STREAM: &str = "stream";
3030
/// Partial value for the `type` label in the request counter for unary requests
3131
pub const REQUEST_TYPE_UNARY: &str = "unary";
3232

33+
fn sanitize_prometheus_prefix(raw: &str) -> String {
34+
// Prometheus metric name pattern: [a-zA-Z_:][a-zA-Z0-9_:]*
35+
let mut s: String = raw
36+
.chars()
37+
.map(|c| if c.is_ascii_alphanumeric() || c == '_' || c == ':' { c } else { '_' })
38+
.collect();
39+
40+
if s.is_empty() {
41+
return FRONTEND_METRIC_PREFIX.to_string();
42+
}
43+
44+
let first = s.as_bytes()[0];
45+
let valid_first = first.is_ascii_alphabetic() || first == b'_' || first == b':';
46+
if !valid_first {
47+
s.insert(0, '_');
48+
}
49+
s
50+
}
51+
3352
pub struct Metrics {
3453
request_counter: IntCounterVec,
3554
inflight_gauge: IntGaugeVec,
@@ -117,8 +136,17 @@ impl Metrics {
117136
/// - `{prefix}_time_to_first_token_seconds` - HistogramVec for time to first token in seconds
118137
/// - `{prefix}_inter_token_latency_seconds` - HistogramVec for inter-token latency in seconds
119138
pub fn new() -> Self {
120-
let prefix = std::env::var(METRICS_PREFIX_ENV)
139+
let raw_prefix = std::env::var(METRICS_PREFIX_ENV)
121140
.unwrap_or_else(|_| FRONTEND_METRIC_PREFIX.to_string());
141+
let prefix = sanitize_prometheus_prefix(&raw_prefix);
142+
if prefix != raw_prefix {
143+
tracing::warn!(
144+
raw=%raw_prefix,
145+
sanitized=%prefix,
146+
env=%METRICS_PREFIX_ENV,
147+
"Sanitized HTTP metrics prefix"
148+
);
149+
}
122150
let frontend_metric_name = |suffix: &str| format!("{}_{}", &prefix, suffix);
123151

124152
let request_counter = IntCounterVec::new(

lib/llm/tests/http_metrics.rs

Lines changed: 73 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,37 @@
11
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
// SPDX-License-Identifier: Apache-2.0
3-
//
4-
// Licensed under the Apache License, Version 2.0 (the "License");
5-
// you may not use this file except in compliance with the License.
6-
// You may obtain a copy of the License at
7-
//
8-
// http://www.apache.org/licenses/LICENSE-2.0
9-
//
10-
// Unless required by applicable law or agreed to in writing, software
11-
// distributed under the License is distributed on an "AS IS" BASIS,
12-
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13-
// See the License for the specific language governing permissions and
14-
// limitations under the License.
153

164
use dynamo_llm::http::service::metrics::{self, Endpoint};
175
use dynamo_llm::http::service::service_v2::HttpService;
186
use dynamo_runtime::CancellationToken;
7+
use serial_test::serial;
198
use std::{env, time::Duration};
209

2110
#[tokio::test]
11+
#[serial]
2212
async fn metrics_prefix_default_then_env_override() {
2313
// Case 1: default prefix
2414
env::remove_var(metrics::METRICS_PREFIX_ENV);
2515
let svc1 = HttpService::builder().port(9101).build().unwrap();
2616
let token1 = CancellationToken::new();
2717
let _h1 = svc1.spawn(token1.clone()).await;
28-
tokio::time::sleep(Duration::from_millis(100)).await;
18+
wait_for_metrics_ready(9101).await;
2919

3020
// Populate labeled metrics
3121
let s1 = svc1.state_clone();
32-
{ let _g = s1.metrics_clone().create_inflight_guard("test-model", Endpoint::ChatCompletions, false); }
33-
let body1 = reqwest::get("http://localhost:9101/metrics").await.unwrap().text().await.unwrap();
22+
{
23+
let _g = s1.metrics_clone().create_inflight_guard(
24+
"test-model",
25+
Endpoint::ChatCompletions,
26+
false,
27+
);
28+
}
29+
let body1 = reqwest::get("http://localhost:9101/metrics")
30+
.await
31+
.unwrap()
32+
.text()
33+
.await
34+
.unwrap();
3435
assert!(body1.contains("dynamo_frontend_requests_total"));
3536
token1.cancel();
3637

@@ -39,12 +40,65 @@ async fn metrics_prefix_default_then_env_override() {
3940
let svc2 = HttpService::builder().port(9102).build().unwrap();
4041
let token2 = CancellationToken::new();
4142
let _h2 = svc2.spawn(token2.clone()).await;
42-
tokio::time::sleep(Duration::from_millis(100)).await;
43+
wait_for_metrics_ready(9102).await;
4344

4445
// Populate labeled metrics
4546
let s2 = svc2.state_clone();
46-
{ let _g = s2.metrics_clone().create_inflight_guard("test-model", Endpoint::ChatCompletions, true); }
47-
let body2 = reqwest::get("http://localhost:9102/metrics").await.unwrap().text().await.unwrap();
48-
assert!(body2.contains("custom_prefix"));
47+
{
48+
let _g =
49+
s2.metrics_clone()
50+
.create_inflight_guard("test-model", Endpoint::ChatCompletions, true);
51+
}
52+
// Single fetch and assert
53+
let body2 = reqwest::get("http://localhost:9102/metrics")
54+
.await
55+
.unwrap()
56+
.text()
57+
.await
58+
.unwrap();
59+
assert!(body2.contains("custom_prefix_requests_total"));
60+
assert!(!body2.contains("dynamo_frontend_requests_total"));
4961
token2.cancel();
62+
63+
// Case 3: invalid env prefix is sanitized
64+
env::set_var(metrics::METRICS_PREFIX_ENV, "nv-llm/http service");
65+
let svc3 = HttpService::builder().port(9103).build().unwrap();
66+
let token3 = CancellationToken::new();
67+
let _h3 = svc3.spawn(token3.clone()).await;
68+
wait_for_metrics_ready(9103).await;
69+
70+
let s3 = svc3.state_clone();
71+
{
72+
let _g =
73+
s3.metrics_clone()
74+
.create_inflight_guard("test-model", Endpoint::ChatCompletions, true);
75+
}
76+
let body3 = reqwest::get("http://localhost:9103/metrics")
77+
.await
78+
.unwrap()
79+
.text()
80+
.await
81+
.unwrap();
82+
assert!(body3.contains("nv_llm_http_service_requests_total"));
83+
assert!(!body3.contains("dynamo_frontend_requests_total"));
84+
token3.cancel();
85+
86+
// Cleanup env to avoid leaking state
87+
env::remove_var(metrics::METRICS_PREFIX_ENV);
88+
}
89+
90+
// Poll /metrics until ready or timeout
91+
async fn wait_for_metrics_ready(port: u16) {
92+
let url = format!("http://localhost:{}/metrics", port);
93+
let start = tokio::time::Instant::now();
94+
let timeout = Duration::from_secs(5);
95+
loop {
96+
if start.elapsed() > timeout {
97+
panic!("Timed out waiting for metrics endpoint at {}", url);
98+
}
99+
match reqwest::get(&url).await {
100+
Ok(resp) if resp.status().is_success() => break,
101+
_ => tokio::time::sleep(Duration::from_millis(50)).await,
102+
}
103+
}
50104
}

0 commit comments

Comments
 (0)