`benchmarks/profiler/deploy/profile_sla_job.yaml` (14 additions & 5 deletions)

```diff
@@ -14,11 +14,8 @@ spec:
         image: ${DOCKER_IMAGE}
         resources:
           requests:
-            cpu: "1"
-            memory: "2Gi"
-          limits:
-            cpu: "2"
-            memory: "4Gi"
+            cpu: "16"
+            memory: "10Gi"
         env:
           - name: HUGGING_FACE_HUB_TOKEN
             valueFrom:
@@ -37,6 +34,18 @@ spec:
           - /workspace/profiling_results
           - --namespace
           - ${NAMESPACE}
+          - --min-num-gpus-per-engine
+          - "1"
+          - --max-num-gpus-per-engine
+          - "8"
+          - --isl
+          - "3000"
+          - --osl
+          - "150"
+          - --ttft
+          - "200"
+          - --itl
+          - "20"
         volumeMounts:
           - name: output-volume
             mountPath: /workspace/profiling_results
```
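Note: the job now pins the profiler's sweep bounds and SLA targets on the command line, and raises the CPU/memory request (the `limits` block is dropped). As a rough sketch of how such flags are consumed on the script side — the actual `profile_sla` entrypoint is not part of this diff, so the parser below is an assumption:

```python
# Hypothetical sketch of the profiler's flag parsing; flag names mirror the
# Job args above, but the real entrypoint's parser is not shown in this PR.
import argparse

parser = argparse.ArgumentParser(description="SLA profiler (sketch)")
parser.add_argument("--min-num-gpus-per-engine", type=int, default=1,
                    help="smallest per-engine GPU count (TP size) to sweep")
parser.add_argument("--max-num-gpus-per-engine", type=int, default=8,
                    help="largest per-engine GPU count (TP size) to sweep")
parser.add_argument("--isl", type=int, help="average input sequence length, tokens")
parser.add_argument("--osl", type=int, help="average output sequence length, tokens")
parser.add_argument("--ttft", type=float, help="target time-to-first-token, ms")
parser.add_argument("--itl", type=float, help="target inter-token latency, ms")

# Parsing the exact values the Job manifest passes:
args = parser.parse_args(
    ["--isl", "3000", "--osl", "150", "--ttft", "200", "--itl", "20"]
)
print(args.isl, args.osl, args.ttft, args.itl)  # 3000 150 200.0 20.0
```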
`benchmarks/profiler/utils/config.py` (26 additions & 26 deletions)

```diff
@@ -89,16 +89,16 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d
         if target == "prefill":
             # convert prefill worker into decode worker
             config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm"].decode_worker
+                WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
             ] = config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm"].prefill_worker
+                WORKER_COMPONENT_NAMES["vllm"].prefill_worker_k8s_name
             ]
             del config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm"].prefill_worker
+                WORKER_COMPONENT_NAMES["vllm"].prefill_worker_k8s_name
             ]

             args = config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm"].decode_worker
+                WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
             ]["extraPodSpec"]["mainContainer"]["args"]

             args = break_arguments(args)
@@ -112,18 +112,18 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d
             if "--no-enable-prefix-caching" not in args:
                 args = append_argument(args, "--no-enable-prefix-caching")

-            config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
-                "extraPodSpec"
-            ]["mainContainer"]["args"] = join_arguments(args)
+            config["spec"]["services"][
+                WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
+            ]["extraPodSpec"]["mainContainer"]["args"] = join_arguments(args)

         elif target == "decode":
             # delete prefill worker
             del config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm"].prefill_worker
+                WORKER_COMPONENT_NAMES["vllm"].prefill_worker_k8s_name
             ]

             args = config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm"].decode_worker
+                WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
             ]["extraPodSpec"]["mainContainer"]["args"]

             args = break_arguments(args)
@@ -134,13 +134,13 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d
             if "--no-enable-prefix-caching" in args:
                 args.remove("--no-enable-prefix-caching")

-            config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
-                "extraPodSpec"
-            ]["mainContainer"]["args"] = join_arguments(args)
+            config["spec"]["services"][
+                WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
+            ]["extraPodSpec"]["mainContainer"]["args"] = join_arguments(args)

             # set num workers to 1
             decode_worker_config = config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm"].decode_worker
+                WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
             ]
             decode_worker_config["replicas"] = 1

@@ -150,16 +150,16 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d
     def set_config_tp_size(cls, config: dict, tp_size: int):
         config = deepcopy(config)

-        config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
-            "resources"
-        ]["requests"]["gpu"] = str(tp_size)
-        config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
-            "resources"
-        ]["limits"]["gpu"] = str(tp_size)
+        config["spec"]["services"][
+            WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
+        ]["resources"]["requests"]["gpu"] = str(tp_size)
+        config["spec"]["services"][
+            WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
+        ]["resources"]["limits"]["gpu"] = str(tp_size)

-        args = config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
-            "extraPodSpec"
-        ]["mainContainer"]["args"]
+        args = config["spec"]["services"][
+            WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
+        ]["extraPodSpec"]["mainContainer"]["args"]

         args = break_arguments(args)

@@ -169,15 +169,15 @@ def set_config_tp_size(cls, config: dict, tp_size: int):
         except ValueError:
             args = append_argument(args, ["--tensor-parallel-size", str(tp_size)])

-        config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
-            "extraPodSpec"
-        ]["mainContainer"]["args"] = join_arguments(args)
+        config["spec"]["services"][
+            WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
+        ]["extraPodSpec"]["mainContainer"]["args"] = join_arguments(args)

         return config

     @classmethod
     def get_model_name(cls, config: dict) -> str:
-        worker_name = WORKER_COMPONENT_NAMES["vllm"].decode_worker
+        worker_name = WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
         args = config["spec"]["services"][worker_name]["extraPodSpec"]["mainContainer"][
             "args"
         ]
```
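Note: every call site above repeats the same nested lookup, now keyed by the Kubernetes service name. Purely as an illustration of that access pattern (this helper does not exist in the PR):

```python
# Illustrative helper only -- not part of the PR. It spells out the lookup
# that config.py repeats at each call site.
from typing import Any


def get_service(config: dict, k8s_name: str) -> dict[str, Any]:
    """Return the DGD service block keyed by its k8s name, e.g. 'VllmDecodeWorker'."""
    return config["spec"]["services"][k8s_name]


def get_main_container_args(config: dict, k8s_name: str) -> Any:
    """Return the main container's args for the named service."""
    return get_service(config, k8s_name)["extraPodSpec"]["mainContainer"]["args"]
```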
`components/backends/vllm/deploy/disagg_planner.yaml` (2 additions & 2 deletions)

```diff
@@ -141,7 +141,7 @@ spec:
           - -c
         args:
           - "python3 -m dynamo.planner.prometheus"
-    backend:
+    VllmDecodeWorker:
       dynamoNamespace: vllm-disagg-planner
       envFromSecret: hf-token-secret
       componentType: worker
@@ -191,7 +191,7 @@ spec:
           - -c
         args:
           - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B 2>&1 | tee /tmp/vllm.log"
-    prefill:
+    VllmPrefillWorker:
       dynamoNamespace: vllm-disagg-planner
       envFromSecret: hf-token-secret
       componentType: worker
```
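Note: the service keys in this deployment now use the Kubernetes-style worker names instead of the runtime component names (`backend`, `prefill`). A sanity-check sketch of the invariant the rename relies on (hypothetical code, assuming the manifest's services map is shaped as `config.py` expects):

```python
# Hypothetical sanity check -- not part of the PR. The DGD service keys must
# match the planner's *_k8s_name constants, or replica scaling targets nothing.
dgd_services = {
    # ...other services elided...
    "VllmDecodeWorker": {},   # keyed by k8s name; was "backend" before this PR
    "VllmPrefillWorker": {},  # keyed by k8s name; was "prefill" before this PR
}

for required in ("VllmDecodeWorker", "VllmPrefillWorker"):
    assert required in dgd_services, f"missing service key: {required}"
```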
`components/planner/src/dynamo/planner/defaults.py` (4 additions & 2 deletions)

```diff
@@ -74,9 +74,11 @@ class SLAPlannerDefaults(BasePlannerDefaults):


 class VllmComponentName:
-    prefill_worker = "prefill"
+    prefill_worker_k8s_name = "VllmPrefillWorker"
+    prefill_worker_component_name = "prefill"
     prefill_worker_endpoint = "generate"
-    decode_worker = "backend"
+    decode_worker_k8s_name = "VllmDecodeWorker"
+    decode_worker_component_name = "backend"
     decode_worker_endpoint = "generate"

```
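Note: this is the heart of the rename. Each worker now carries two names: a Kubernetes-facing one (the key in the DGD `spec.services` map, used for replica scaling) and a runtime-facing one (the Dynamo component, used for endpoint discovery). A minimal sketch of how the pair is consumed, assuming `WORKER_COMPONENT_NAMES` maps a backend string to this class:

```python
# The class body mirrors defaults.py above; the exact shape of
# WORKER_COMPONENT_NAMES is an assumption based on this PR's call sites.
class VllmComponentName:
    prefill_worker_k8s_name = "VllmPrefillWorker"
    prefill_worker_component_name = "prefill"
    prefill_worker_endpoint = "generate"
    decode_worker_k8s_name = "VllmDecodeWorker"
    decode_worker_component_name = "backend"
    decode_worker_endpoint = "generate"


WORKER_COMPONENT_NAMES = {"vllm": VllmComponentName}

names = WORKER_COMPONENT_NAMES["vllm"]
# Kubernetes-facing: key in the DGD spec.services map, used for scaling.
assert names.decode_worker_k8s_name == "VllmDecodeWorker"
# Runtime-facing: Dynamo component plus endpoint, used for discovery.
assert names.decode_worker_component_name == "backend"
assert names.decode_worker_endpoint == "generate"
```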
`components/planner/src/dynamo/planner/utils/planner_core.py` (16 additions & 4 deletions)

```diff
@@ -106,7 +106,11 @@ async def get_workers_info(self):
         if self.prefill_client is None:
             self.prefill_client = (
                 await self.runtime.namespace(self.namespace)
-                .component(WORKER_COMPONENT_NAMES[self.args.backend].prefill_worker)
+                .component(
+                    WORKER_COMPONENT_NAMES[
+                        self.args.backend
+                    ].prefill_worker_component_name
+                )
                 .endpoint(
                     WORKER_COMPONENT_NAMES[
                         self.args.backend
@@ -127,7 +131,11 @@ async def get_workers_info(self):
         if self.workers_client is None:
             self.workers_client = (
                 await self.runtime.namespace(self.namespace)
-                .component(WORKER_COMPONENT_NAMES[self.args.backend].decode_worker)
+                .component(
+                    WORKER_COMPONENT_NAMES[
+                        self.args.backend
+                    ].decode_worker_component_name
+                )
                 .endpoint(
                     WORKER_COMPONENT_NAMES[self.args.backend].decode_worker_endpoint
                 )
@@ -300,8 +308,12 @@ async def make_adjustments(self):

         if not self.args.no_operation:
             target_replicas = {
-                WORKER_COMPONENT_NAMES[self.args.backend].prefill_worker: next_num_p,
-                WORKER_COMPONENT_NAMES[self.args.backend].decode_worker: next_num_d,
+                WORKER_COMPONENT_NAMES[
+                    self.args.backend
+                ].prefill_worker_k8s_name: next_num_p,
+                WORKER_COMPONENT_NAMES[
+                    self.args.backend
+                ].decode_worker_k8s_name: next_num_d,
             }
             await self.connector.set_component_replicas(target_replicas, blocking=False)

```
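Note: the two naming paths now diverge cleanly in the planner: endpoint discovery goes through `*_component_name`, while replica scaling goes through `*_k8s_name`. A condensed sketch of both paths (`runtime` and `connector` are assumed to behave like the real planner attributes):

```python
# Condensed, hypothetical sketch of the two paths in planner_core.py; `names`
# is a VllmComponentName-like object as in the defaults.py sketch above.
async def adjust(runtime, connector, namespace: str, names,
                 next_num_p: int, next_num_d: int):
    # Discovery: runtime component names ("prefill" / "backend").
    prefill_client = (
        await runtime.namespace(namespace)
        .component(names.prefill_worker_component_name)
        .endpoint(names.prefill_worker_endpoint)
    )

    # Scaling: Kubernetes service names ("VllmPrefillWorker" / "VllmDecodeWorker").
    await connector.set_component_replicas(
        {
            names.prefill_worker_k8s_name: next_num_p,
            names.decode_worker_k8s_name: next_num_d,
        },
        blocking=False,
    )
```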
`docs/architecture/pre_deployment_profiling.md` (31 additions & 8 deletions)

````diff
@@ -82,25 +82,47 @@ kubectl create secret docker-registry nvcr-imagepullsecret \
 # in the project's root folder
 ./container/build.sh --framework VLLM
 # Tag and push to your container registry
-export DOCKER_IMAGE=nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2 # or your own dynamoimage
-# NOTE: DGD_CONFIG_FILE is pointing to the location of the config file inside DOCKER_IMAGE
-# Modify this yaml to profile different models
-export DGD_CONFIG_FILE=/workspace/components/backends/vllm/deploy/disagg.yaml # or your own disagg config file
 ```

 Replace the `image` within `profile_sla_job.yaml` with the tag of the image you pushed.

-**Step 2: Run profiling (required)**
+**Step 2: Set SLA target**
+
+Edit `$DYNAMO_HOME/benchmarks/profiler/deploy/profile_sla_job.yaml` to set the target ISL, OSL, TTFT, and ITL.
+
+```yaml
+spec:
+  template:
+    spec:
+      containers:
+        - name: profile-sla
+          args:
+            - --isl
+            - "3000" # average ISL is 3000 tokens
+            - --osl
+            - "150" # average OSL is 150 tokens
+            - --ttft
+            - "200" # target TTFT is 200ms
+            - --itl
+            - "20" # target ITL is 20ms
+```
+
+**Step 3: Run profiling (required)**
+
 ```bash
 cd $DYNAMO_HOME/benchmarks/profiler/deploy
 envsubst < profiling_pvc.yaml | kubectl apply -f -
 envsubst < profile_sla_sa.yaml | kubectl apply -f -
 envsubst < profile_sla_rbac.yaml | kubectl apply -f -
 envsubst < profile_sla_binding.yaml | kubectl apply -f -
+
+export DOCKER_IMAGE=nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2 # or your own image
+# NOTE: DGD_CONFIG_FILE is pointing to the location of the config file inside DOCKER_IMAGE
+export DGD_CONFIG_FILE=/workspace/components/backends/vllm/deploy/disagg.yaml # or your own disagg config file
 envsubst < profile_sla_job.yaml | kubectl apply -f -
 ```

-**Step 3: Wait for profiling to complete**
+**Step 4: Wait for profiling to complete**
 ```bash
 kubectl get jobs -n $NAMESPACE
 kubectl logs job/profile-sla -n $NAMESPACE
@@ -129,13 +151,14 @@ The profiling results are stored in a PVC named `profiling-pvc`. To access the r

 1. **Create a temporary pod to access the PVC:**
 ```bash
-kubectl run temp-access --image=alpine:latest --rm -it --restart=Never \
-  --overrides='{"spec":{"containers":[{"name":"temp-access","image":"alpine:latest","command":["sh"],"volumeMounts":[{"name":"results","mountPath":"/workspace/profiling_results"}]}],"volumes":[{"name":"results","persistentVolumeClaim":{"claimName":"profiling-pvc"}}]}}' \
+kubectl run temp-access --image=alpine:latest --restart=Never \
+  --overrides='{"spec":{"containers":[{"name":"temp-access","image":"alpine:latest","command":["tail","-f","/dev/null"],"volumeMounts":[{"name":"results","mountPath":"/workspace/profiling_results"}]}],"volumes":[{"name":"results","persistentVolumeClaim":{"claimName":"profiling-pvc"}}]}}' \
   -n $NAMESPACE
 ```

 2. **Inside the temporary pod, navigate to the results directory:**
 ```bash
+kubectl exec -it temp-access -n $NAMESPACE -- sh
 cd /workspace/profiling_results
 ls -la
 ```
````
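Note: since the temporary pod no longer uses `--rm -it` and instead idles on `tail -f /dev/null`, it keeps running after you exit the shell; remove it with `kubectl delete pod temp-access -n $NAMESPACE` when you are done.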