
Commit 157714a

chore: add instructions to modify SLA to profile_sla doc; update component name (#2167)
Parent: 8248a11

6 files changed, +93 −47 lines

benchmarks/profiler/deploy/profile_sla_job.yaml

Lines changed: 14 additions & 5 deletions
@@ -14,11 +14,8 @@ spec:
           image: ${DOCKER_IMAGE}
           resources:
             requests:
-              cpu: "1"
-              memory: "2Gi"
-            limits:
-              cpu: "2"
-              memory: "4Gi"
+              cpu: "16"
+              memory: "10Gi"
           env:
             - name: HUGGING_FACE_HUB_TOKEN
               valueFrom:
@@ -37,6 +34,18 @@ spec:
             - /workspace/profiling_results
             - --namespace
             - ${NAMESPACE}
+            - --min-num-gpus-per-engine
+            - "1"
+            - --max-num-gpus-per-engine
+            - "8"
+            - --isl
+            - "3000"
+            - --osl
+            - "150"
+            - --ttft
+            - "200"
+            - --itl
+            - "20"
           volumeMounts:
             - name: output-volume
               mountPath: /workspace/profiling_results
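
The four new SLA flags define the profiling target: `--isl`/`--osl` give the average input and output sequence lengths in tokens, `--ttft`/`--itl` give the latency targets in milliseconds, and the GPU bounds set the per-engine parallelism search range. As a minimal sketch of how such flags could be parsed (assuming an argparse-style CLI; the project's actual profiler entrypoint may differ):

```python
# Illustrative sketch only: an argparse parser mirroring the job args above.
# Flag names and defaults come from the diff; the parser itself is hypothetical.
import argparse

parser = argparse.ArgumentParser(description="SLA profiler flags (sketch)")
parser.add_argument("--namespace", required=True)
parser.add_argument("--min-num-gpus-per-engine", type=int, default=1)
parser.add_argument("--max-num-gpus-per-engine", type=int, default=8)
parser.add_argument("--isl", type=int, default=3000, help="average input sequence length (tokens)")
parser.add_argument("--osl", type=int, default=150, help="average output sequence length (tokens)")
parser.add_argument("--ttft", type=float, default=200, help="target time-to-first-token (ms)")
parser.add_argument("--itl", type=float, default=20, help="target inter-token latency (ms)")

args = parser.parse_args(["--namespace", "demo"])
print(args.isl, args.osl, args.ttft, args.itl)  # 3000 150 200.0 20.0
```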

benchmarks/profiler/utils/config.py

Lines changed: 26 additions & 26 deletions
@@ -89,16 +89,16 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d
         if target == "prefill":
             # convert prefill worker into decode worker
             config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm"].decode_worker
+                WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
             ] = config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm"].prefill_worker
+                WORKER_COMPONENT_NAMES["vllm"].prefill_worker_k8s_name
             ]
             del config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm"].prefill_worker
+                WORKER_COMPONENT_NAMES["vllm"].prefill_worker_k8s_name
             ]

             args = config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm"].decode_worker
+                WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
             ]["extraPodSpec"]["mainContainer"]["args"]

             args = break_arguments(args)
@@ -112,18 +112,18 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d
             if "--no-enable-prefix-caching" not in args:
                 args = append_argument(args, "--no-enable-prefix-caching")

-            config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
-                "extraPodSpec"
-            ]["mainContainer"]["args"] = join_arguments(args)
+            config["spec"]["services"][
+                WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
+            ]["extraPodSpec"]["mainContainer"]["args"] = join_arguments(args)

         elif target == "decode":
             # delete prefill worker
             del config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm"].prefill_worker
+                WORKER_COMPONENT_NAMES["vllm"].prefill_worker_k8s_name
             ]

             args = config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm"].decode_worker
+                WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
             ]["extraPodSpec"]["mainContainer"]["args"]

             args = break_arguments(args)
@@ -134,13 +134,13 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d
             if "--no-enable-prefix-caching" in args:
                 args.remove("--no-enable-prefix-caching")

-            config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
-                "extraPodSpec"
-            ]["mainContainer"]["args"] = join_arguments(args)
+            config["spec"]["services"][
+                WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
+            ]["extraPodSpec"]["mainContainer"]["args"] = join_arguments(args)

         # set num workers to 1
         decode_worker_config = config["spec"]["services"][
-            WORKER_COMPONENT_NAMES["vllm"].decode_worker
+            WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
         ]
         decode_worker_config["replicas"] = 1

@@ -150,16 +150,16 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d
     def set_config_tp_size(cls, config: dict, tp_size: int):
         config = deepcopy(config)

-        config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
-            "resources"
-        ]["requests"]["gpu"] = str(tp_size)
-        config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
-            "resources"
-        ]["limits"]["gpu"] = str(tp_size)
+        config["spec"]["services"][
+            WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
+        ]["resources"]["requests"]["gpu"] = str(tp_size)
+        config["spec"]["services"][
+            WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
+        ]["resources"]["limits"]["gpu"] = str(tp_size)

-        args = config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
-            "extraPodSpec"
-        ]["mainContainer"]["args"]
+        args = config["spec"]["services"][
+            WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
+        ]["extraPodSpec"]["mainContainer"]["args"]

         args = break_arguments(args)

@@ -169,15 +169,15 @@ def set_config_tp_size(cls, config: dict, tp_size: int):
         except ValueError:
             args = append_argument(args, ["--tensor-parallel-size", str(tp_size)])

-        config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
-            "extraPodSpec"
-        ]["mainContainer"]["args"] = join_arguments(args)
+        config["spec"]["services"][
+            WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
+        ]["extraPodSpec"]["mainContainer"]["args"] = join_arguments(args)

         return config

     @classmethod
     def get_model_name(cls, config: dict) -> str:
-        worker_name = WORKER_COMPONENT_NAMES["vllm"].decode_worker
+        worker_name = WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
         args = config["spec"]["services"][worker_name]["extraPodSpec"]["mainContainer"][
             "args"
         ]
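
The mechanical rename throughout this file reflects that the keys under `spec.services` in the deployment config are Kubernetes service names ("VllmDecodeWorker"), not runtime component names ("backend"). A toy sketch of the same nested-dict access pattern (the helper and toy config are hypothetical; only the key name comes from this commit):

```python
# Sketch of the access pattern used by set_config_tp_size, on a toy config.
from copy import deepcopy

DECODE_K8S_NAME = "VllmDecodeWorker"  # mirrors decode_worker_k8s_name

def set_tp_size(config: dict, tp_size: int) -> dict:
    # Never mutate the caller's config; mirrors the deepcopy in the real code.
    config = deepcopy(config)
    svc = config["spec"]["services"][DECODE_K8S_NAME]  # keyed by k8s name, not "backend"
    svc["resources"]["requests"]["gpu"] = str(tp_size)
    svc["resources"]["limits"]["gpu"] = str(tp_size)
    return config

toy = {"spec": {"services": {DECODE_K8S_NAME: {"resources": {"requests": {}, "limits": {}}}}}}
print(set_tp_size(toy, 4)["spec"]["services"][DECODE_K8S_NAME]["resources"]["limits"]["gpu"])  # "4"
```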

components/backends/vllm/deploy/disagg_planner.yaml

Lines changed: 2 additions & 2 deletions
@@ -141,7 +141,7 @@ spec:
             - -c
           args:
             - "python3 -m dynamo.planner.prometheus"
-    backend:
+    VllmDecodeWorker:
       dynamoNamespace: vllm-disagg-planner
       envFromSecret: hf-token-secret
       componentType: worker
@@ -191,7 +191,7 @@ spec:
             - -c
           args:
             - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B 2>&1 | tee /tmp/vllm.log"
-    prefill:
+    VllmPrefillWorker:
       dynamoNamespace: vllm-disagg-planner
       envFromSecret: hf-token-secret
       componentType: worker
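
These service keys must match the `*_k8s_name` values defined in `defaults.py` below, since the planner scales workers by exactly these keys. A hedged consistency check, with inline YAML standing in for the real file (requires PyYAML):

```python
# Sketch: verify a deployment's service keys match the expected k8s worker names.
import yaml  # PyYAML

doc = yaml.safe_load("""
spec:
  services:
    VllmDecodeWorker: {componentType: worker}
    VllmPrefillWorker: {componentType: worker}
""")

expected = {"VllmDecodeWorker", "VllmPrefillWorker"}
assert expected <= set(doc["spec"]["services"]), "service keys must match k8s worker names"
print("service keys OK")
```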

components/planner/src/dynamo/planner/defaults.py

Lines changed: 4 additions & 2 deletions
@@ -74,9 +74,11 @@ class SLAPlannerDefaults(BasePlannerDefaults):


 class VllmComponentName:
-    prefill_worker = "prefill"
+    prefill_worker_k8s_name = "VllmPrefillWorker"
+    prefill_worker_component_name = "prefill"
     prefill_worker_endpoint = "generate"
-    decode_worker = "backend"
+    decode_worker_k8s_name = "VllmDecodeWorker"
+    decode_worker_component_name = "backend"
     decode_worker_endpoint = "generate"

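
For reference, the full class after this change, annotated with how each name family is used elsewhere in this commit (annotations inferred from the other diffs):

```python
class VllmComponentName:
    # Kubernetes service names: keys under spec.services, used when scaling replicas
    prefill_worker_k8s_name = "VllmPrefillWorker"
    decode_worker_k8s_name = "VllmDecodeWorker"
    # Dynamo runtime component names: used to build runtime clients
    prefill_worker_component_name = "prefill"
    decode_worker_component_name = "backend"
    # Endpoints exposed by each worker
    prefill_worker_endpoint = "generate"
    decode_worker_endpoint = "generate"
```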

components/planner/src/dynamo/planner/utils/planner_core.py

Lines changed: 16 additions & 4 deletions
@@ -106,7 +106,11 @@ async def get_workers_info(self):
         if self.prefill_client is None:
             self.prefill_client = (
                 await self.runtime.namespace(self.namespace)
-                .component(WORKER_COMPONENT_NAMES[self.args.backend].prefill_worker)
+                .component(
+                    WORKER_COMPONENT_NAMES[
+                        self.args.backend
+                    ].prefill_worker_component_name
+                )
                 .endpoint(
                     WORKER_COMPONENT_NAMES[
                         self.args.backend
@@ -127,7 +131,11 @@ async def get_workers_info(self):
         if self.workers_client is None:
             self.workers_client = (
                 await self.runtime.namespace(self.namespace)
-                .component(WORKER_COMPONENT_NAMES[self.args.backend].decode_worker)
+                .component(
+                    WORKER_COMPONENT_NAMES[
+                        self.args.backend
+                    ].decode_worker_component_name
+                )
                 .endpoint(
                     WORKER_COMPONENT_NAMES[self.args.backend].decode_worker_endpoint
                 )
@@ -300,8 +308,12 @@ async def make_adjustments(self):

         if not self.args.no_operation:
             target_replicas = {
-                WORKER_COMPONENT_NAMES[self.args.backend].prefill_worker: next_num_p,
-                WORKER_COMPONENT_NAMES[self.args.backend].decode_worker: next_num_d,
+                WORKER_COMPONENT_NAMES[
+                    self.args.backend
+                ].prefill_worker_k8s_name: next_num_p,
+                WORKER_COMPONENT_NAMES[
+                    self.args.backend
+                ].decode_worker_k8s_name: next_num_d,
             }
             await self.connector.set_component_replicas(target_replicas, blocking=False)

docs/architecture/pre_deployment_profiling.md

Lines changed: 31 additions & 8 deletions
@@ -82,25 +82,47 @@ kubectl create secret docker-registry nvcr-imagepullsecret \
 # in the project's root folder
 ./container/build.sh --framework VLLM
 # Tag and push to your container registry
+export DOCKER_IMAGE=nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2 # or your own dynamo image
+# NOTE: DGD_CONFIG_FILE points to the location of the config file inside DOCKER_IMAGE
+# Modify this yaml to profile different models
+export DGD_CONFIG_FILE=/workspace/components/backends/vllm/deploy/disagg.yaml # or your own disagg config file
 ```

 Replace the `image` within `profile_sla_job.yaml` with the tag of the image you pushed.

-**Step 2: Run profiling (required)**
+**Step 2: Set SLA target**
+
+Edit `$DYNAMO_HOME/benchmarks/profiler/deploy/profile_sla_job.yaml` to set the target ISL, OSL, TTFT, and ITL.
+
+```yaml
+spec:
+  template:
+    spec:
+      containers:
+        - name: profile-sla
+          args:
+            - --isl
+            - "3000" # average ISL is 3000 tokens
+            - --osl
+            - "150" # average OSL is 150 tokens
+            - --ttft
+            - "200" # target TTFT is 200ms
+            - --itl
+            - "20" # target ITL is 20ms
+```
+
+**Step 3: Run profiling (required)**
+
 ```bash
 cd $DYNAMO_HOME/benchmarks/profiler/deploy
 envsubst < profiling_pvc.yaml | kubectl apply -f -
 envsubst < profile_sla_sa.yaml | kubectl apply -f -
 envsubst < profile_sla_rbac.yaml | kubectl apply -f -
 envsubst < profile_sla_binding.yaml | kubectl apply -f -
-
-export DOCKER_IMAGE=nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2 # or your own image
-# NOTE: DGD_CONFIG_FILE is pointing to the location of the config file inside DOCKER_IMAGE
-export DGD_CONFIG_FILE=/workspace/components/backends/vllm/deploy/disagg.yaml # or your own disagg config file
 envsubst < profile_sla_job.yaml | kubectl apply -f -
 ```

-**Step 3: Wait for profiling to complete**
+**Step 4: Wait for profiling to complete**
 ```bash
 kubectl get jobs -n $NAMESPACE
 kubectl logs job/profile-sla -n $NAMESPACE
@@ -129,13 +151,14 @@ The profiling results are stored in a PVC named `profiling-pvc`. To access the r

 1. **Create a temporary pod to access the PVC:**
    ```bash
-   kubectl run temp-access --image=alpine:latest --rm -it --restart=Never \
-     --overrides='{"spec":{"containers":[{"name":"temp-access","image":"alpine:latest","command":["sh"],"volumeMounts":[{"name":"results","mountPath":"/workspace/profiling_results"}]}],"volumes":[{"name":"results","persistentVolumeClaim":{"claimName":"profiling-pvc"}}]}}' \
+   kubectl run temp-access --image=alpine:latest --restart=Never \
+     --overrides='{"spec":{"containers":[{"name":"temp-access","image":"alpine:latest","command":["tail","-f","/dev/null"],"volumeMounts":[{"name":"results","mountPath":"/workspace/profiling_results"}]}],"volumes":[{"name":"results","persistentVolumeClaim":{"claimName":"profiling-pvc"}}]}}' \
    -n $NAMESPACE
    ```

 2. **Inside the temporary pod, navigate to the results directory:**
    ```bash
+   kubectl exec -it temp-access -n $NAMESPACE -- sh
    cd /workspace/profiling_results
    ls -la
    ```
