Skip to content

Commit ce36d9f

Browse files
hhzhang16, zhongdaor-nv, tmonty12, keivenchang, kthui
authored
feat: allow in-cluster perf benchmarks with a kubectl one-liner (#3144)
Signed-off-by: Hannah Zhang <hannahz@nvidia.com> Signed-off-by: zhongdaor <zhongdaor@nvidia.com> Signed-off-by: tmontfort <tmontfort@nvidia.com> Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com> Signed-off-by: Jacky <18255193+kthui@users.noreply.github.com> Signed-off-by: richardhuo-nv <rihuo@nvidia.com> Signed-off-by: Tushar Sharma <tusharma@nvidia.com> Signed-off-by: tzulingk@nvidia.com <tzulingk@nvidia.com> Signed-off-by: Krishnan Prashanth <kprashanth@nvidia.com> Signed-off-by: Olga Andreeva <oandreeva@nvidia.com> Signed-off-by: oandreeva-nv <oandreeva-nv@nvidia.com> Co-authored-by: zhongdaor-nv <zhongdaor@nvidia.com> Co-authored-by: Thomas Montfort <61255722+tmonty12@users.noreply.github.com> Co-authored-by: Keiven C <213854356+keivenchang@users.noreply.github.com> Co-authored-by: Jacky <18255193+kthui@users.noreply.github.com> Co-authored-by: Richard Huo <rihuo@nvidia.com> Co-authored-by: Tushar Sharma <tusharma@nvidia.com> Co-authored-by: Tzu-Ling Kan <tzulingk@nvidia.com> Co-authored-by: KrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com> Co-authored-by: Olga Andreeva <124622579+oandreeva-nv@users.noreply.github.com> Co-authored-by: Ziqi Fan <ziqif@nvidia.com> Co-authored-by: oandreeva-nv <oandreeva-nv@nvidia.com>
1 parent c433447 commit ce36d9f

File tree

6 files changed

+377
-36
lines changed

6 files changed

+377
-36
lines changed

benchmarks/incluster/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../../docs/benchmarks/benchmarking.md#server-side-benchmarking-in-cluster
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# One-shot Kubernetes Job that runs the in-cluster perf benchmark
# (benchmarks.utils.benchmark) against one or more frontend services and
# writes results to the dynamo-pvc persistent volume.
apiVersion: batch/v1
kind: Job
metadata:
  name: dynamo-benchmark
spec:
  template:
    spec:
      serviceAccountName: dynamo-sa
      imagePullSecrets:
        - name: docker-imagepullsecret
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        fsGroup: 1000
      containers:
        - name: benchmark-runner
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.5.0
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
            readOnlyRootFilesystem: true
            runAsNonRoot: true
          resources:
            requests:
              cpu: "4"
              memory: "8Gi"
            limits:
              cpu: "8"
              memory: "16Gi"
          env:
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token-secret
                  key: HF_TOKEN
          command: ["python3", "-m", "benchmarks.utils.benchmark"]
          args:
            - --model
            - "Qwen/Qwen3-0.6B"
            - --isl
            - "2000"
            - --std
            - "10"
            - --osl
            - "256"
            - --output-dir
            - /data/results
            - --input
            - "qwen-vllm-agg=vllm-agg-frontend:8000"
            # To benchmark additional services, repeat the --input flag and
            # value pair above once per service, e.g.:
            # - --input
            # - "name=service-url:port"
          volumeMounts:
            - name: data-volume
              mountPath: /data
      restartPolicy: Never
      volumes:
        - name: data-volume
          persistentVolumeClaim:
            claimName: dynamo-pvc
  backoffLimit: 0
  ttlSecondsAfterFinished: 3600 # Clean up job after 1 hour

benchmarks/utils/benchmark.py

Lines changed: 27 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -7,53 +7,59 @@
77
import re
88
import sys
99
from typing import Dict, Tuple
10+
from urllib.parse import urlsplit
1011

11-
from benchmarks.utils.workflow import run_benchmark_workflow
12+
from benchmarks.utils.workflow import has_http_scheme, run_benchmark_workflow
13+
from deploy.utils.kubernetes import is_running_in_cluster
1214

1315

1416
def validate_inputs(inputs: Dict[str, str]) -> None:
    """Validate that all inputs are HTTP endpoints or internal service URLs when running in cluster.

    Outside a Kubernetes cluster every endpoint must carry an explicit
    ``http://`` or ``https://`` scheme.  Inside a cluster, scheme-less
    internal service URLs of the form ``host[:port][/path]`` are also
    accepted.

    Args:
        inputs: Mapping of label -> endpoint string.

    Raises:
        ValueError: If an endpoint is malformed or a reserved label is used.
    """
    # Hoisted out of the loop: the answer cannot change between iterations.
    in_cluster = is_running_in_cluster()

    for label, value in inputs.items():
        v = value.strip()
        if not has_http_scheme(v):
            if in_cluster:
                # Allow internal service URLs like host[:port][/path].
                # Prefix "//" so urlsplit treats v as a netloc, not a path.
                try:
                    parts = urlsplit(f"//{v}")
                    host_ok = bool(parts.hostname)
                    # .port itself raises ValueError for non-numeric or
                    # out-of-range ports; fold that into the same error path.
                    port_ok = parts.port is None or (1 <= parts.port <= 65535)
                except ValueError:
                    host_ok = port_ok = False
                if not (host_ok and port_ok):
                    raise ValueError(
                        f"Input '{label}' must be HTTP(S) or internal service URL. Got: {value}"
                    )
            else:
                raise ValueError(f"Input '{label}' must be HTTP endpoint. Got: {value}")

        # Validate reserved labels ('plots' collides with generated output)
        if label.lower() == "plots":
            raise ValueError("Label 'plots' is reserved")
2739

2840

2941
def parse_input(input_str: str) -> Tuple[str, str]:
3042
"""Parse input string in format key=value with additional validation"""
3143
if "=" not in input_str:
32-
raise ValueError(
33-
f"Invalid input format. Expected: <label>=<endpoint>, got: {input_str}"
34-
)
44+
raise ValueError(f"Invalid input format: {input_str}")
3545

3646
parts = input_str.split("=", 1) # Split on first '=' only
3747
if len(parts) != 2:
38-
raise ValueError(
39-
f"Invalid input format. Expected: <label>=<endpoint>, got: {input_str}"
40-
)
48+
raise ValueError(f"Invalid input format: {input_str}")
4149

4250
label, value = parts
4351

4452
if not label.strip():
45-
raise ValueError("Label cannot be empty")
53+
raise ValueError("Empty label")
4654
if not value.strip():
47-
raise ValueError("Value cannot be empty")
55+
raise ValueError("Empty value")
4856

4957
label = label.strip()
5058
value = value.strip()
5159

5260
# Validate label characters
5361
if not re.match(r"^[a-zA-Z0-9_-]+$", label):
54-
raise ValueError(
55-
f"Label must contain only letters, numbers, hyphens, and underscores. Invalid label: {label}"
56-
)
62+
raise ValueError(f"Invalid label: {label}")
5763

5864
return label, value
5965

@@ -114,7 +120,7 @@ def main() -> int:
114120
)
115121
print()
116122

117-
# Validate that all inputs are HTTP endpoints
123+
# Validate that inputs are HTTP endpoints or in-cluster service URLs
118124
validate_inputs(parsed_inputs)
119125

120126
except ValueError as e:

benchmarks/utils/workflow.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,21 @@
66

77
from benchmarks.utils.genai import run_concurrency_sweep
88
from benchmarks.utils.plot import generate_plots
9+
from deploy.utils.kubernetes import is_running_in_cluster
10+
11+
12+
def has_http_scheme(url: str) -> bool:
    """Return True when *url* begins with an http:// or https:// scheme (case-insensitive)."""
    lowered = url.lower()
    return lowered.startswith("http://") or lowered.startswith("https://")
15+
16+
17+
def normalize_service_url(endpoint: str) -> str:
    """Return a URL usable by the benchmark client.

    Scheme-less endpoints are assumed to be in-cluster service addresses and
    get an ``http://`` prefix when we are running inside Kubernetes; outside
    the cluster, earlier validation guarantees the scheme is already present.
    """
    candidate = endpoint.strip()
    if not has_http_scheme(candidate) and is_running_in_cluster():
        return f"http://{candidate}"
    return candidate
924

1025

1126
def print_concurrency_start(
@@ -30,15 +45,18 @@ def run_endpoint_benchmark(
3045
output_dir: Path,
3146
) -> None:
3247
"""Run benchmark for an existing endpoint with custom label"""
33-
print(f"🚀 Starting benchmark of endpoint '{label}': {endpoint}")
48+
# Normalize endpoint to a usable URL (handles in-cluster scheme-less inputs)
49+
service_url = normalize_service_url(endpoint)
50+
51+
print(f"🚀 Starting benchmark of endpoint '{label}': {service_url}")
3452
print(f"📁 Results will be saved to: {output_dir / label}")
3553
print_concurrency_start(label, model, isl, osl, std)
3654

3755
# Create output directory
3856
(output_dir / label).mkdir(parents=True, exist_ok=True)
3957

4058
run_concurrency_sweep(
41-
service_url=endpoint,
59+
service_url=service_url,
4260
model_name=model,
4361
isl=isl,
4462
osl=osl,
@@ -73,7 +91,7 @@ def run_benchmark_workflow(
7391
model: str = "Qwen/Qwen3-0.6B",
7492
output_dir: str = "benchmarks/results",
7593
) -> None:
76-
"""Main benchmark workflow orchestrator for HTTP endpoints only"""
94+
"""Main benchmark workflow orchestrator for HTTP endpoints (and in-cluster internal service URLs)"""
7795
output_dir_path = Path(output_dir)
7896
output_dir_path.mkdir(parents=True, exist_ok=True)
7997

deploy/utils/kubernetes.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,22 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515

16+
import os
1617
import subprocess
1718
import sys
1819
from pathlib import Path
1920
from typing import List
2021

2122
PVC_ACCESS_POD_NAME = "pvc-access-pod"
2223

24+
# Path to the service-account token Kubernetes mounts into every pod.
K8S_SA_TOKEN = Path("/var/run/secrets/kubernetes.io/serviceaccount/token")


def is_running_in_cluster() -> bool:
    """Return True if running inside a Kubernetes cluster.

    Checks the KUBERNETES_SERVICE_HOST environment variable first (injected
    into every pod), then falls back to the mounted service-account token.
    """
    if os.environ.get("KUBERNETES_SERVICE_HOST"):
        return True
    return K8S_SA_TOKEN.exists()
31+
2332

2433
def run_command(
2534
cmd: List[str], capture_output: bool = True, exit_on_error: bool = True

0 commit comments

Comments
 (0)