`benchmarks/profiler/deploy/profile_sla_job.yaml` (14 additions & 5 deletions)

```diff
@@ -14,11 +14,8 @@ spec:
         image: ${DOCKER_IMAGE}
         resources:
           requests:
-            cpu: "1"
-            memory: "2Gi"
-          limits:
-            cpu: "2"
-            memory: "4Gi"
+            cpu: "16"
+            memory: "10Gi"
         env:
           - name: HUGGING_FACE_HUB_TOKEN
             valueFrom:
@@ -37,6 +34,18 @@ spec:
           - /workspace/profiling_results
           - --namespace
           - ${NAMESPACE}
+          - --min-num-gpus-per-engine
+          - "1"
+          - --max-num-gpus-per-engine
+          - "8"
+          - --isl
+          - "3000"
+          - --osl
+          - "150"
+          - --ttft
+          - "200"
+          - --itl
+          - "20"
         volumeMounts:
           - name: output-volume
             mountPath: /workspace/profiling_results
```
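Note: the job now pins the profiler's sweep bounds and SLA targets on the command line, and raises the CPU/memory request (the `limits` block is dropped). As a rough sketch of how such flags are consumed on the script side — the actual `profile_sla` entrypoint is not part of this diff, so the parser below is an assumption:

```python
# Hypothetical sketch of the profiler's flag parsing; flag names mirror the
# Job args above, but the real entrypoint's parser is not shown in this PR.
import argparse

parser = argparse.ArgumentParser(description="SLA profiler (sketch)")
parser.add_argument("--min-num-gpus-per-engine", type=int, default=1,
                    help="smallest per-engine GPU count (TP size) to sweep")
parser.add_argument("--max-num-gpus-per-engine", type=int, default=8,
                    help="largest per-engine GPU count (TP size) to sweep")
parser.add_argument("--isl", type=int, help="average input sequence length, tokens")
parser.add_argument("--osl", type=int, help="average output sequence length, tokens")
parser.add_argument("--ttft", type=float, help="target time-to-first-token, ms")
parser.add_argument("--itl", type=float, help="target inter-token latency, ms")

# Parsing the exact values the Job manifest passes:
args = parser.parse_args(
    ["--isl", "3000", "--osl", "150", "--ttft", "200", "--itl", "20"]
)
print(args.isl, args.osl, args.ttft, args.itl)  # 3000 150 200.0 20.0
```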
`benchmarks/profiler/utils/config.py` (26 additions & 26 deletions)

```diff
@@ -89,16 +89,16 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d
         if target == "prefill":
             # convert prefill worker into decode worker
             config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm"].decode_worker
+                WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
             ] = config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm"].prefill_worker
+                WORKER_COMPONENT_NAMES["vllm"].prefill_worker_k8s_name
             ]
             del config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm"].prefill_worker
+                WORKER_COMPONENT_NAMES["vllm"].prefill_worker_k8s_name
             ]

             args = config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm"].decode_worker
+                WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
             ]["extraPodSpec"]["mainContainer"]["args"]

             args = break_arguments(args)
@@ -112,18 +112,18 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d
             if "--no-enable-prefix-caching" not in args:
                 args = append_argument(args, "--no-enable-prefix-caching")

-            config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
-                "extraPodSpec"
-            ]["mainContainer"]["args"] = join_arguments(args)
+            config["spec"]["services"][
+                WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
+            ]["extraPodSpec"]["mainContainer"]["args"] = join_arguments(args)

         elif target == "decode":
             # delete prefill worker
             del config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm"].prefill_worker
+                WORKER_COMPONENT_NAMES["vllm"].prefill_worker_k8s_name
             ]

             args = config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm"].decode_worker
+                WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
             ]["extraPodSpec"]["mainContainer"]["args"]

             args = break_arguments(args)
@@ -134,13 +134,13 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d
             if "--no-enable-prefix-caching" in args:
                 args.remove("--no-enable-prefix-caching")

-            config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
-                "extraPodSpec"
-            ]["mainContainer"]["args"] = join_arguments(args)
+            config["spec"]["services"][
+                WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
+            ]["extraPodSpec"]["mainContainer"]["args"] = join_arguments(args)

             # set num workers to 1
             decode_worker_config = config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm"].decode_worker
+                WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
             ]
             decode_worker_config["replicas"] = 1

@@ -150,16 +150,16 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d
     def set_config_tp_size(cls, config: dict, tp_size: int):
         config = deepcopy(config)

-        config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
-            "resources"
-        ]["requests"]["gpu"] = str(tp_size)
-        config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
-            "resources"
-        ]["limits"]["gpu"] = str(tp_size)
+        config["spec"]["services"][
+            WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
+        ]["resources"]["requests"]["gpu"] = str(tp_size)
+        config["spec"]["services"][
+            WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
+        ]["resources"]["limits"]["gpu"] = str(tp_size)

-        args = config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
-            "extraPodSpec"
-        ]["mainContainer"]["args"]
+        args = config["spec"]["services"][
+            WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
+        ]["extraPodSpec"]["mainContainer"]["args"]

         args = break_arguments(args)

@@ -169,15 +169,15 @@ def set_config_tp_size(cls, config: dict, tp_size: int):
         except ValueError:
             args = append_argument(args, ["--tensor-parallel-size", str(tp_size)])

-        config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
-            "extraPodSpec"
-        ]["mainContainer"]["args"] = join_arguments(args)
+        config["spec"]["services"][
+            WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
+        ]["extraPodSpec"]["mainContainer"]["args"] = join_arguments(args)

         return config

     @classmethod
     def get_model_name(cls, config: dict) -> str:
-        worker_name = WORKER_COMPONENT_NAMES["vllm"].decode_worker
+        worker_name = WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
         args = config["spec"]["services"][worker_name]["extraPodSpec"]["mainContainer"][
             "args"
         ]
```
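Note: every call site above repeats the same nested lookup, now keyed by the Kubernetes service name. Purely as an illustration of that access pattern (this helper does not exist in the PR):

```python
# Illustrative helper only -- not part of the PR. It spells out the lookup
# that config.py repeats at each call site.
from typing import Any


def get_service(config: dict, k8s_name: str) -> dict[str, Any]:
    """Return the DGD service block keyed by its k8s name, e.g. 'VllmDecodeWorker'."""
    return config["spec"]["services"][k8s_name]


def get_main_container_args(config: dict, k8s_name: str) -> Any:
    """Return the main container's args for the named service."""
    return get_service(config, k8s_name)["extraPodSpec"]["mainContainer"]["args"]
```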
`components/backends/vllm/deploy/disagg_planner.yaml` (2 additions & 2 deletions)

```diff
@@ -141,7 +141,7 @@ spec:
           - -c
         args:
           - "python3 -m dynamo.planner.prometheus"
-    backend:
+    VllmDecodeWorker:
       dynamoNamespace: vllm-disagg-planner
       envFromSecret: hf-token-secret
       componentType: worker
@@ -191,7 +191,7 @@ spec:
           - -c
         args:
           - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B 2>&1 | tee /tmp/vllm.log"
-    prefill:
+    VllmPrefillWorker:
       dynamoNamespace: vllm-disagg-planner
       envFromSecret: hf-token-secret
       componentType: worker
```
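Note: the service keys in this deployment now use the Kubernetes-style worker names instead of the runtime component names (`backend`, `prefill`). A sanity-check sketch of the invariant the rename relies on (hypothetical code, assuming the manifest's services map is shaped as `config.py` expects):

```python
# Hypothetical sanity check -- not part of the PR. The DGD service keys must
# match the planner's *_k8s_name constants, or replica scaling targets nothing.
dgd_services = {
    # ...other services elided...
    "VllmDecodeWorker": {},   # keyed by k8s name; was "backend" before this PR
    "VllmPrefillWorker": {},  # keyed by k8s name; was "prefill" before this PR
}

for required in ("VllmDecodeWorker", "VllmPrefillWorker"):
    assert required in dgd_services, f"missing service key: {required}"
```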
`components/planner/src/dynamo/planner/defaults.py` (4 additions & 2 deletions)

```diff
@@ -74,9 +74,11 @@ class SLAPlannerDefaults(BasePlannerDefaults):


 class VllmComponentName:
-    prefill_worker = "prefill"
+    prefill_worker_k8s_name = "VllmPrefillWorker"
+    prefill_worker_component_name = "prefill"
     prefill_worker_endpoint = "generate"
-    decode_worker = "backend"
+    decode_worker_k8s_name = "VllmDecodeWorker"
+    decode_worker_component_name = "backend"
     decode_worker_endpoint = "generate"

```
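Note: this is the heart of the rename. Each worker now carries two names: a Kubernetes-facing one (the key in the DGD `spec.services` map, used for replica scaling) and a runtime-facing one (the Dynamo component, used for endpoint discovery). A minimal sketch of how the pair is consumed, assuming `WORKER_COMPONENT_NAMES` maps a backend string to this class:

```python
# The class body mirrors defaults.py above; the exact shape of
# WORKER_COMPONENT_NAMES is an assumption based on this PR's call sites.
class VllmComponentName:
    prefill_worker_k8s_name = "VllmPrefillWorker"
    prefill_worker_component_name = "prefill"
    prefill_worker_endpoint = "generate"
    decode_worker_k8s_name = "VllmDecodeWorker"
    decode_worker_component_name = "backend"
    decode_worker_endpoint = "generate"


WORKER_COMPONENT_NAMES = {"vllm": VllmComponentName}

names = WORKER_COMPONENT_NAMES["vllm"]
# Kubernetes-facing: key in the DGD spec.services map, used for scaling.
assert names.decode_worker_k8s_name == "VllmDecodeWorker"
# Runtime-facing: Dynamo component plus endpoint, used for discovery.
assert names.decode_worker_component_name == "backend"
assert names.decode_worker_endpoint == "generate"
```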
`components/planner/src/dynamo/planner/utils/planner_core.py` (16 additions & 4 deletions)

```diff
@@ -106,7 +106,11 @@ async def get_workers_info(self):
         if self.prefill_client is None:
             self.prefill_client = (
                 await self.runtime.namespace(self.namespace)
-                .component(WORKER_COMPONENT_NAMES[self.args.backend].prefill_worker)
+                .component(
+                    WORKER_COMPONENT_NAMES[
+                        self.args.backend
+                    ].prefill_worker_component_name
+                )
                 .endpoint(
                     WORKER_COMPONENT_NAMES[
                         self.args.backend
@@ -127,7 +131,11 @@ async def get_workers_info(self):
         if self.workers_client is None:
             self.workers_client = (
                 await self.runtime.namespace(self.namespace)
-                .component(WORKER_COMPONENT_NAMES[self.args.backend].decode_worker)
+                .component(
+                    WORKER_COMPONENT_NAMES[
+                        self.args.backend
+                    ].decode_worker_component_name
+                )
                 .endpoint(
                     WORKER_COMPONENT_NAMES[self.args.backend].decode_worker_endpoint
                 )
@@ -300,8 +308,12 @@ async def make_adjustments(self):

         if not self.args.no_operation:
             target_replicas = {
-                WORKER_COMPONENT_NAMES[self.args.backend].prefill_worker: next_num_p,
-                WORKER_COMPONENT_NAMES[self.args.backend].decode_worker: next_num_d,
+                WORKER_COMPONENT_NAMES[
+                    self.args.backend
+                ].prefill_worker_k8s_name: next_num_p,
+                WORKER_COMPONENT_NAMES[
+                    self.args.backend
+                ].decode_worker_k8s_name: next_num_d,
             }
             await self.connector.set_component_replicas(target_replicas, blocking=False)

```
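Note: the two naming paths now diverge cleanly in the planner: endpoint discovery goes through `*_component_name`, while replica scaling goes through `*_k8s_name`. A condensed sketch of both paths (`runtime` and `connector` are assumed to behave like the real planner attributes):

```python
# Condensed, hypothetical sketch of the two paths in planner_core.py; `names`
# is a VllmComponentName-like object as in the defaults.py sketch above.
async def adjust(runtime, connector, namespace: str, names,
                 next_num_p: int, next_num_d: int):
    # Discovery: runtime component names ("prefill" / "backend").
    prefill_client = (
        await runtime.namespace(namespace)
        .component(names.prefill_worker_component_name)
        .endpoint(names.prefill_worker_endpoint)
    )

    # Scaling: Kubernetes service names ("VllmPrefillWorker" / "VllmDecodeWorker").
    await connector.set_component_replicas(
        {
            names.prefill_worker_k8s_name: next_num_p,
            names.decode_worker_k8s_name: next_num_d,
        },
        blocking=False,
    )
```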
`docs/architecture/pre_deployment_profiling.md` (31 additions & 8 deletions)

````diff
@@ -82,25 +82,47 @@ kubectl create secret docker-registry nvcr-imagepullsecret \
 # in the project's root folder
 ./container/build.sh --framework VLLM
 # Tag and push to your container registry
-export DOCKER_IMAGE=nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2 # or your own dynamoimage
-# NOTE: DGD_CONFIG_FILE is pointing to the location of the config file inside DOCKER_IMAGE
-# Modify this yaml to profile different models
-export DGD_CONFIG_FILE=/workspace/components/backends/vllm/deploy/disagg.yaml # or your own disagg config file
 ```

 Replace the `image` within `profile_sla_job.yaml` with the tag of the image you pushed.

-**Step 2: Run profiling (required)**
+**Step 2: Set SLA target**
+
+Edit `$DYNAMO_HOME/benchmarks/profiler/deploy/profile_sla_job.yaml` to set the target ISL, OSL, TTFT, and ITL.
+
+```yaml
+spec:
+  template:
+    spec:
+      containers:
+        - name: profile-sla
+          args:
+            - --isl
+            - "3000" # average ISL is 3000 tokens
+            - --osl
+            - "150" # average OSL is 150 tokens
+            - --ttft
+            - "200" # target TTFT is 200ms
+            - --itl
+            - "20" # target ITL is 20ms
+```
+
+**Step 3: Run profiling (required)**
+
 ```bash
 cd $DYNAMO_HOME/benchmarks/profiler/deploy
 envsubst < profiling_pvc.yaml | kubectl apply -f -
 envsubst < profile_sla_sa.yaml | kubectl apply -f -
 envsubst < profile_sla_rbac.yaml | kubectl apply -f -
 envsubst < profile_sla_binding.yaml | kubectl apply -f -
+
+export DOCKER_IMAGE=nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2 # or your own image
+# NOTE: DGD_CONFIG_FILE is pointing to the location of the config file inside DOCKER_IMAGE
+export DGD_CONFIG_FILE=/workspace/components/backends/vllm/deploy/disagg.yaml # or your own disagg config file
 envsubst < profile_sla_job.yaml | kubectl apply -f -
 ```

-**Step 3: Wait for profiling to complete**
+**Step 4: Wait for profiling to complete**
 ```bash
 kubectl get jobs -n $NAMESPACE
 kubectl logs job/profile-sla -n $NAMESPACE
@@ -129,13 +151,14 @@ The profiling results are stored in a PVC named `profiling-pvc`. To access the r

 1. **Create a temporary pod to access the PVC:**
 ```bash
-kubectl run temp-access --image=alpine:latest --rm -it --restart=Never \
-  --overrides='{"spec":{"containers":[{"name":"temp-access","image":"alpine:latest","command":["sh"],"volumeMounts":[{"name":"results","mountPath":"/workspace/profiling_results"}]}],"volumes":[{"name":"results","persistentVolumeClaim":{"claimName":"profiling-pvc"}}]}}' \
+kubectl run temp-access --image=alpine:latest --restart=Never \
+  --overrides='{"spec":{"containers":[{"name":"temp-access","image":"alpine:latest","command":["tail","-f","/dev/null"],"volumeMounts":[{"name":"results","mountPath":"/workspace/profiling_results"}]}],"volumes":[{"name":"results","persistentVolumeClaim":{"claimName":"profiling-pvc"}}]}}' \
   -n $NAMESPACE
 ```

 2. **Inside the temporary pod, navigate to the results directory:**
 ```bash
+kubectl exec -it temp-access -n $NAMESPACE -- sh
 cd /workspace/profiling_results
 ls -la
 ```
````
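Note: since the temporary pod no longer uses `--rm -it` and instead idles on `tail -f /dev/null`, it keeps running after you exit the shell; remove it with `kubectl delete pod temp-access -n $NAMESPACE` when you are done.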