Skip to content

Commit ce36d9f

Browse files
hhzhang16, zhongdaor-nv, tmonty12, keivenchang, kthui
authored
feat: allow in-cluster perf benchmarks with a kubectl one-liner (#3144)
Signed-off-by: Hannah Zhang <hannahz@nvidia.com> Signed-off-by: zhongdaor <zhongdaor@nvidia.com> Signed-off-by: tmontfort <tmontfort@nvidia.com> Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com> Signed-off-by: Jacky <18255193+kthui@users.noreply.github.com> Signed-off-by: richardhuo-nv <rihuo@nvidia.com> Signed-off-by: Tushar Sharma <tusharma@nvidia.com> Signed-off-by: tzulingk@nvidia.com <tzulingk@nvidia.com> Signed-off-by: Krishnan Prashanth <kprashanth@nvidia.com> Signed-off-by: Olga Andreeva <oandreeva@nvidia.com> Signed-off-by: oandreeva-nv <oandreeva-nv@nvidia.com> Co-authored-by: zhongdaor-nv <zhongdaor@nvidia.com> Co-authored-by: Thomas Montfort <61255722+tmonty12@users.noreply.github.com> Co-authored-by: Keiven C <213854356+keivenchang@users.noreply.github.com> Co-authored-by: Jacky <18255193+kthui@users.noreply.github.com> Co-authored-by: Richard Huo <rihuo@nvidia.com> Co-authored-by: Tushar Sharma <tusharma@nvidia.com> Co-authored-by: Tzu-Ling Kan <tzulingk@nvidia.com> Co-authored-by: KrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com> Co-authored-by: Olga Andreeva <124622579+oandreeva-nv@users.noreply.github.com> Co-authored-by: Ziqi Fan <ziqif@nvidia.com> Co-authored-by: oandreeva-nv <oandreeva-nv@nvidia.com>
1 parent c433447 commit ce36d9f

File tree

6 files changed

+377
-36
lines changed

6 files changed

+377
-36
lines changed

benchmarks/incluster/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../../docs/benchmarks/benchmarking.md#server-side-benchmarking-in-cluster
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# One-shot Kubernetes Job that runs the in-cluster perf benchmark
# (benchmarks.utils.benchmark) against one or more frontend services and
# writes results to the dynamo-pvc persistent volume.
apiVersion: batch/v1
kind: Job
metadata:
  name: dynamo-benchmark
spec:
  template:
    spec:
      serviceAccountName: dynamo-sa
      imagePullSecrets:
        - name: docker-imagepullsecret
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        fsGroup: 1000
      containers:
        - name: benchmark-runner
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.5.0
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
            readOnlyRootFilesystem: true
            runAsNonRoot: true
          resources:
            requests:
              cpu: "4"
              memory: "8Gi"
            limits:
              cpu: "8"
              memory: "16Gi"
          env:
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token-secret
                  key: HF_TOKEN
          command: ["python3", "-m", "benchmarks.utils.benchmark"]
          args:
            - --model
            - "Qwen/Qwen3-0.6B"
            - --isl
            - "2000"
            - --std
            - "10"
            - --osl
            - "256"
            - --output-dir
            - /data/results
            - --input
            - "qwen-vllm-agg=vllm-agg-frontend:8000"
            # To benchmark additional services, repeat the --input flag and
            # value pair above once per service, e.g.:
            # - --input
            # - "name=service-url:port"
          volumeMounts:
            - name: data-volume
              mountPath: /data
      restartPolicy: Never
      volumes:
        - name: data-volume
          persistentVolumeClaim:
            claimName: dynamo-pvc
  backoffLimit: 0
  ttlSecondsAfterFinished: 3600 # Clean up job after 1 hour

benchmarks/utils/benchmark.py

Lines changed: 27 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -7,53 +7,59 @@
77
import re
88
import sys
99
from typing import Dict, Tuple
10+
from urllib.parse import urlsplit
1011

11-
from benchmarks.utils.workflow import run_benchmark_workflow
12+
from benchmarks.utils.workflow import has_http_scheme, run_benchmark_workflow
13+
from deploy.utils.kubernetes import is_running_in_cluster
1214

1315

1416
def validate_inputs(inputs: Dict[str, str]) -> None:
    """Validate that all inputs are HTTP endpoints or internal service URLs when running in cluster.

    Outside a Kubernetes cluster every endpoint must carry an explicit
    ``http://`` or ``https://`` scheme.  Inside a cluster, scheme-less
    internal service URLs of the form ``host[:port][/path]`` are also
    accepted.

    Args:
        inputs: Mapping of label -> endpoint string.

    Raises:
        ValueError: If an endpoint is malformed or a reserved label is used.
    """
    # Hoisted out of the loop: the answer cannot change between iterations.
    in_cluster = is_running_in_cluster()

    for label, value in inputs.items():
        v = value.strip()
        if not has_http_scheme(v):
            if in_cluster:
                # Allow internal service URLs like host[:port][/path].
                # Prefix "//" so urlsplit treats v as a netloc, not a path.
                try:
                    parts = urlsplit(f"//{v}")
                    host_ok = bool(parts.hostname)
                    # .port itself raises ValueError for non-numeric or
                    # out-of-range ports; fold that into the same error path.
                    port_ok = parts.port is None or (1 <= parts.port <= 65535)
                except ValueError:
                    host_ok = port_ok = False
                if not (host_ok and port_ok):
                    raise ValueError(
                        f"Input '{label}' must be HTTP(S) or internal service URL. Got: {value}"
                    )
            else:
                raise ValueError(f"Input '{label}' must be HTTP endpoint. Got: {value}")

        # Validate reserved labels ('plots' collides with generated output)
        if label.lower() == "plots":
            raise ValueError("Label 'plots' is reserved")
2739

2840

2941
def parse_input(input_str: str) -> Tuple[str, str]:
3042
"""Parse input string in format key=value with additional validation"""
3143
if "=" not in input_str:
32-
raise ValueError(
33-
f"Invalid input format. Expected: <label>=<endpoint>, got: {input_str}"
34-
)
44+
raise ValueError(f"Invalid input format: {input_str}")
3545

3646
parts = input_str.split("=", 1) # Split on first '=' only
3747
if len(parts) != 2:
38-
raise ValueError(
39-
f"Invalid input format. Expected: <label>=<endpoint>, got: {input_str}"
40-
)
48+
raise ValueError(f"Invalid input format: {input_str}")
4149

4250
label, value = parts
4351

4452
if not label.strip():
45-
raise ValueError("Label cannot be empty")
53+
raise ValueError("Empty label")
4654
if not value.strip():
47-
raise ValueError("Value cannot be empty")
55+
raise ValueError("Empty value")
4856

4957
label = label.strip()
5058
value = value.strip()
5159

5260
# Validate label characters
5361
if not re.match(r"^[a-zA-Z0-9_-]+$", label):
54-
raise ValueError(
55-
f"Label must contain only letters, numbers, hyphens, and underscores. Invalid label: {label}"
56-
)
62+
raise ValueError(f"Invalid label: {label}")
5763

5864
return label, value
5965

@@ -114,7 +120,7 @@ def main() -> int:
114120
)
115121
print()
116122

117-
# Validate that all inputs are HTTP endpoints
123+
# Validate that inputs are HTTP endpoints or in-cluster service URLs
118124
validate_inputs(parsed_inputs)
119125

120126
except ValueError as e:

benchmarks/utils/workflow.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,21 @@
66

77
from benchmarks.utils.genai import run_concurrency_sweep
88
from benchmarks.utils.plot import generate_plots
9+
from deploy.utils.kubernetes import is_running_in_cluster
10+
11+
12+
def has_http_scheme(url: str) -> bool:
    """Return True when *url* begins with an http:// or https:// scheme (case-insensitive)."""
    lowered = url.lower()
    return lowered.startswith("http://") or lowered.startswith("https://")
15+
16+
17+
def normalize_service_url(endpoint: str) -> str:
    """Return a URL usable by the benchmark client.

    Scheme-less endpoints are assumed to be in-cluster service addresses and
    get an ``http://`` prefix when we are running inside Kubernetes; outside
    the cluster, earlier validation guarantees the scheme is already present.
    """
    candidate = endpoint.strip()
    if not has_http_scheme(candidate) and is_running_in_cluster():
        return f"http://{candidate}"
    return candidate
924

1025

1126
def print_concurrency_start(
@@ -30,15 +45,18 @@ def run_endpoint_benchmark(
3045
output_dir: Path,
3146
) -> None:
3247
"""Run benchmark for an existing endpoint with custom label"""
33-
print(f"🚀 Starting benchmark of endpoint '{label}': {endpoint}")
48+
# Normalize endpoint to a usable URL (handles in-cluster scheme-less inputs)
49+
service_url = normalize_service_url(endpoint)
50+
51+
print(f"🚀 Starting benchmark of endpoint '{label}': {service_url}")
3452
print(f"📁 Results will be saved to: {output_dir / label}")
3553
print_concurrency_start(label, model, isl, osl, std)
3654

3755
# Create output directory
3856
(output_dir / label).mkdir(parents=True, exist_ok=True)
3957

4058
run_concurrency_sweep(
41-
service_url=endpoint,
59+
service_url=service_url,
4260
model_name=model,
4361
isl=isl,
4462
osl=osl,
@@ -73,7 +91,7 @@ def run_benchmark_workflow(
7391
model: str = "Qwen/Qwen3-0.6B",
7492
output_dir: str = "benchmarks/results",
7593
) -> None:
76-
"""Main benchmark workflow orchestrator for HTTP endpoints only"""
94+
"""Main benchmark workflow orchestrator for HTTP endpoints (and in-cluster internal service URLs)"""
7795
output_dir_path = Path(output_dir)
7896
output_dir_path.mkdir(parents=True, exist_ok=True)
7997

deploy/utils/kubernetes.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,22 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515

16+
import os
1617
import subprocess
1718
import sys
1819
from pathlib import Path
1920
from typing import List
2021

2122
PVC_ACCESS_POD_NAME = "pvc-access-pod"
2223

24+
# Path to the service-account token Kubernetes mounts into every pod.
K8S_SA_TOKEN = Path("/var/run/secrets/kubernetes.io/serviceaccount/token")


def is_running_in_cluster() -> bool:
    """Return True if running inside a Kubernetes cluster.

    Checks the KUBERNETES_SERVICE_HOST environment variable first (injected
    into every pod), then falls back to the mounted service-account token.
    """
    if os.environ.get("KUBERNETES_SERVICE_HOST"):
        return True
    return K8S_SA_TOKEN.exists()
31+
2332

2433
def run_command(
2534
cmd: List[str], capture_output: bool = True, exit_on_error: bool = True

0 commit comments

Comments
 (0)