From 32e5b17295dfd1557eef6a9796922b4bfdaadef8 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Sat, 26 Jul 2025 15:11:16 -0700 Subject: [PATCH 1/3] feat: add disagg examples --- components/backends/sglang/deploy/agg.yaml | 2 +- .../backends/sglang/deploy/agg_router.yaml | 96 +++++++++++ components/backends/sglang/deploy/disagg.yaml | 155 ++++++++++++++++++ 3 files changed, 252 insertions(+), 1 deletion(-) create mode 100644 components/backends/sglang/deploy/agg_router.yaml create mode 100644 components/backends/sglang/deploy/disagg.yaml diff --git a/components/backends/sglang/deploy/agg.yaml b/components/backends/sglang/deploy/agg.yaml index b030716a11..65ee0b87df 100644 --- a/components/backends/sglang/deploy/agg.yaml +++ b/components/backends/sglang/deploy/agg.yaml @@ -42,7 +42,7 @@ spec: workingDir: /workspace/components/backends/sglang command: ["sh", "-c"] args: - - "python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo && python3 -m dynamo.frontend" + - "python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo && dynamo run in=http out=dyn --http-port=8000" SGLangDecodeWorker: envFromSecret: hf-token-secret livenessProbe: diff --git a/components/backends/sglang/deploy/agg_router.yaml b/components/backends/sglang/deploy/agg_router.yaml new file mode 100644 index 0000000000..93b2b44e91 --- /dev/null +++ b/components/backends/sglang/deploy/agg_router.yaml @@ -0,0 +1,96 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: sglang-agg-router +spec: + services: + Frontend: + livenessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 60 + periodSeconds: 60 + timeoutSeconds: 30 + failureThreshold: 10 + readinessProbe: + exec: + command: + - /bin/sh + - -c + - "exit 0" + initialDelaySeconds: 60 + periodSeconds: 60 + timeoutSeconds: 30 + failureThreshold: 10 + dynamoNamespace: sglang-agg-router + componentType: main + replicas: 1 + resources: + requests: + cpu: "5" + memory: "10Gi" + limits: + cpu: "5" + memory: "10Gi" + extraPodSpec: + mainContainer: + image: my-registry/sglang-runtime:my-tag + workingDir: /workspace/components/backends/sglang + command: ["sh", "-c"] + args: + - "python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo && dynamo run in=http out=dyn --router-mode kv --http-port=8000" + SGLangDecodeWorker: + envFromSecret: hf-token-secret + livenessProbe: + exec: + command: + - /bin/sh + - -c + - "exit 0" + periodSeconds: 60 + timeoutSeconds: 30 + failureThreshold: 10 + readinessProbe: + exec: + command: + - /bin/sh + - -c + - "exit 0" + initialDelaySeconds: 60 + periodSeconds: 60 + timeoutSeconds: 30 + failureThreshold: 10 + dynamoNamespace: sglang-agg-router + componentType: worker + replicas: 1 + resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" + extraPodSpec: + mainContainer: + image: my-registry/sglang-runtime:my-tag + workingDir: /workspace/components/backends/sglang + args: + - "python3" + - "-m" + - "dynamo.sglang.worker" + - "--model-path" + - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" + - "--served-model-name" + - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" + - "--page-size" + - "16" + - "--tp" + - "1" + - "--trust-remote-code" + - "--skip-tokenizer-init" diff --git a/components/backends/sglang/deploy/disagg.yaml b/components/backends/sglang/deploy/disagg.yaml new file mode 100644 index 0000000000..b15033a846 --- /dev/null +++ b/components/backends/sglang/deploy/disagg.yaml @@ -0,0 +1,155 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: sglang-agg +spec: + services: + Frontend: + livenessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 60 + periodSeconds: 60 + timeoutSeconds: 30 + failureThreshold: 10 + readinessProbe: + exec: + command: + - /bin/sh + - -c + - "exit 0" + initialDelaySeconds: 60 + periodSeconds: 60 + timeoutSeconds: 30 + failureThreshold: 10 + dynamoNamespace: sglang-disagg + componentType: main + replicas: 1 + resources: + requests: + cpu: "5" + memory: "10Gi" + limits: + cpu: "5" + memory: "10Gi" + extraPodSpec: + mainContainer: + image: my-registry/sglang-runtime:my-tag + workingDir: /workspace/components/backends/sglang + command: ["sh", "-c"] + args: + - "python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo && dynamo run in=http out=dyn --http-port=8000" + SGLangDecodeWorker: + envFromSecret: hf-token-secret + livenessProbe: + exec: + command: + - /bin/sh + - -c + - "exit 0" + periodSeconds: 60 + timeoutSeconds: 30 + failureThreshold: 10 + readinessProbe: + exec: + command: + - /bin/sh + - -c + - "exit 0" + initialDelaySeconds: 60 + periodSeconds: 60 + timeoutSeconds: 30 + failureThreshold: 10 + dynamoNamespace: sglang-disagg + componentType: worker + replicas: 1 + resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" + extraPodSpec: + mainContainer: + image: my-registry/sglang-runtime:my-tag + workingDir: /workspace/components/backends/sglang + args: + - "python3" + - "-m" + - "dynamo.sglang.worker" + - "--model-path" + - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" + - "--served-model-name" + - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" + - "--page-size" + - "16" + - "--tp" + - "1" + - "--trust-remote-code" + - "--skip-tokenizer-init" + - "--disaggregation-mode" + - "decode" + - "--disaggregation-transfer-backend" + - "nixl" + SGLangPrefillWorker: + envFromSecret: hf-token-secret + livenessProbe: + exec: + command: + - /bin/sh + - -c + - "exit 0" + periodSeconds: 60 + timeoutSeconds: 30 + failureThreshold: 10 + readinessProbe: + exec: + command: + - /bin/sh + - -c + - "exit 0" + initialDelaySeconds: 60 + periodSeconds: 60 + timeoutSeconds: 30 + failureThreshold: 10 + dynamoNamespace: sglang-disagg + componentType: worker + replicas: 1 + resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" + extraPodSpec: + mainContainer: + image: my-registry/sglang-runtime:my-tag + workingDir: /workspace/components/backends/sglang + args: + - "python3" + - "-m" + - "dynamo.sglang.worker" + - "--model-path" + - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" + - "--served-model-name" + - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" + - "--page-size" + - "16" + - "--tp" + - "1" + - "--trust-remote-code" + - "--skip-tokenizer-init" + - "--disaggregation-mode" + - "prefill" + - "--disaggregation-transfer-backend" + - "nixl" \ No newline at end of file From b7f2e9e3428d27259904f5196fea9faf99576985 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Sat, 26 Jul 2025 23:20:26 -0700 Subject: [PATCH 2/3] move from dynamo-run to dynamo.frontend --- components/backends/sglang/deploy/agg.yaml | 2 +- components/backends/sglang/deploy/disagg.yaml | 2 +- components/backends/sglang/launch/agg.sh | 2 +- components/backends/sglang/launch/disagg.sh | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/components/backends/sglang/deploy/agg.yaml b/components/backends/sglang/deploy/agg.yaml index 65ee0b87df..d1ca0c554f 100644 --- a/components/backends/sglang/deploy/agg.yaml +++ b/components/backends/sglang/deploy/agg.yaml @@ -42,7 +42,7 @@ spec: workingDir: /workspace/components/backends/sglang command: ["sh", "-c"] args: - - "python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo && dynamo run in=http out=dyn --http-port=8000" + - "python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo && python3 -m dynamo.frontend --http-port=8000" SGLangDecodeWorker: envFromSecret: hf-token-secret livenessProbe: diff --git a/components/backends/sglang/deploy/disagg.yaml b/components/backends/sglang/deploy/disagg.yaml index b15033a846..b46dce231d 100644 --- a/components/backends/sglang/deploy/disagg.yaml +++ b/components/backends/sglang/deploy/disagg.yaml @@ -42,7 +42,7 @@ spec: workingDir: /workspace/components/backends/sglang command: ["sh", "-c"] args: - - "python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo && dynamo run in=http out=dyn --http-port=8000" + - "python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo && python3 -m dynamo.frontend --http-port=8000" SGLangDecodeWorker: envFromSecret: hf-token-secret livenessProbe: diff --git a/components/backends/sglang/launch/agg.sh b/components/backends/sglang/launch/agg.sh index 94153ba9cb..62029416db 100755 --- a/components/backends/sglang/launch/agg.sh +++ b/components/backends/sglang/launch/agg.sh @@ -15,7 +15,7 @@ trap cleanup EXIT INT TERM python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo # run ingress -dynamo run in=http out=dyn --http-port=8000 & +python3 -m dynamo.frontend --http-port=8000 & DYNAMO_PID=$! # run worker diff --git a/components/backends/sglang/launch/disagg.sh b/components/backends/sglang/launch/disagg.sh index fecd9cdab4..e5d569f0a7 100755 --- a/components/backends/sglang/launch/disagg.sh +++ b/components/backends/sglang/launch/disagg.sh @@ -15,7 +15,7 @@ trap cleanup EXIT INT TERM python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo # run ingress -dynamo run in=http out=dyn --http-port=8000 & +python3 -m dynamo.frontend --http-port=8000 & DYNAMO_PID=$! # run prefill worker From 68f22add4aa9c4c62c6d96571fbcf7c14c0b3b94 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Sun, 27 Jul 2025 15:47:25 -0700 Subject: [PATCH 3/3] fix: sglang examples --- components/backends/sglang/deploy/agg.yaml | 2 +- components/backends/sglang/deploy/agg_router.yaml | 2 +- components/backends/sglang/deploy/disagg.yaml | 4 ++-- components/backends/sglang/launch/disagg_dp_attn.sh | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/components/backends/sglang/deploy/agg.yaml b/components/backends/sglang/deploy/agg.yaml index d1ca0c554f..639726a605 100644 --- a/components/backends/sglang/deploy/agg.yaml +++ b/components/backends/sglang/deploy/agg.yaml @@ -42,7 +42,7 @@ spec: workingDir: /workspace/components/backends/sglang command: ["sh", "-c"] args: - - "python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo && python3 -m dynamo.frontend --http-port=8000" + - "python3 -m dynamo.sglang.utils.clear_namespace --namespace sglang-agg && python3 -m dynamo.frontend --http-port=8000" SGLangDecodeWorker: envFromSecret: hf-token-secret livenessProbe: diff --git a/components/backends/sglang/deploy/agg_router.yaml b/components/backends/sglang/deploy/agg_router.yaml index 93b2b44e91..ab0a5bbce6 100644 --- a/components/backends/sglang/deploy/agg_router.yaml +++ b/components/backends/sglang/deploy/agg_router.yaml @@ -42,7 +42,7 @@ spec: workingDir: /workspace/components/backends/sglang command: ["sh", "-c"] args: - - "python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo && dynamo run in=http out=dyn --router-mode kv --http-port=8000" + - "python3 -m dynamo.sglang.utils.clear_namespace --namespace sglang-agg-router && python3 -m dynamo.frontend --http-port=8000 --router-mode kv" SGLangDecodeWorker: envFromSecret: hf-token-secret livenessProbe: diff --git a/components/backends/sglang/deploy/disagg.yaml b/components/backends/sglang/deploy/disagg.yaml index b46dce231d..06c4b842d2 100644 --- a/components/backends/sglang/deploy/disagg.yaml +++ b/components/backends/sglang/deploy/disagg.yaml @@ -4,7 +4,7 @@ apiVersion: nvidia.com/v1alpha1 kind: DynamoGraphDeployment metadata: - name: sglang-agg + name: sglang-disagg spec: services: Frontend: @@ -42,7 +42,7 @@ spec: workingDir: /workspace/components/backends/sglang command: ["sh", "-c"] args: - - "python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo && python3 -m dynamo.frontend --http-port=8000" + - "python3 -m dynamo.sglang.utils.clear_namespace --namespace sglang-disagg && python3 -m dynamo.frontend --http-port=8000" SGLangDecodeWorker: envFromSecret: hf-token-secret livenessProbe: diff --git a/components/backends/sglang/launch/disagg_dp_attn.sh b/components/backends/sglang/launch/disagg_dp_attn.sh index 5e6eb43252..9fa04feffe 100755 --- a/components/backends/sglang/launch/disagg_dp_attn.sh +++ b/components/backends/sglang/launch/disagg_dp_attn.sh @@ -15,7 +15,7 @@ trap cleanup EXIT INT TERM python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo # run ingress -dynamo run in=http out=dyn --http-port=8000 & +python3 -m dynamo.frontend --http-port=8000 & DYNAMO_PID=$! # run prefill worker @@ -33,7 +33,7 @@ python3 -m dynamo.sglang.worker \ PREFILL_PID=$! # run decode worker -CUDA_VISIBLE_DEVICES=2,3 python3 dynamo.sglang.decode_worker \ +CUDA_VISIBLE_DEVICES=2,3 python3 -m dynamo.sglang.decode_worker \ --model-path silence09/DeepSeek-R1-Small-2layers \ --served-model-name silence09/DeepSeek-R1-Small-2layers \ --tp 2 \