# Patch summary (free text ahead of the first "diff --git" header; patch tools skip it).
#   * deploy/agg.yaml: the Frontend now clears the "sglang-agg" namespace and starts
#     dynamo.frontend with --http-port=8000, the port its /health liveness probe targets
#     in the sibling manifests below.
#   * deploy/agg_router.yaml (new): Frontend started with "--router-mode kv" plus one
#     SGLangDecodeWorker, both in the "sglang-agg-router" namespace.
#   * deploy/disagg.yaml (new): Frontend plus one decode worker and one prefill worker
#     ("--disaggregation-mode decode|prefill", transfer backend "nixl") in "sglang-disagg".
#     The file is emitted with a terminating newline, so its recorded index blob id is
#     informational only.
#   * launch/agg.sh, launch/disagg.sh, launch/disagg_dp_attn.sh: ingress is started with
#     "python3 -m dynamo.frontend" instead of "dynamo run in=http out=dyn"; the decode
#     worker in disagg_dp_attn.sh is launched as a module ("python3 -m ...").
#   * Every exec probe in the new manifests runs `exit 0`, i.e. it always succeeds.
diff --git a/components/backends/sglang/deploy/agg.yaml b/components/backends/sglang/deploy/agg.yaml
index b030716a11..639726a605 100644
--- a/components/backends/sglang/deploy/agg.yaml
+++ b/components/backends/sglang/deploy/agg.yaml
@@ -42,7 +42,7 @@ spec:
           workingDir: /workspace/components/backends/sglang
           command: ["sh", "-c"]
           args:
-            - "python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo && python3 -m dynamo.frontend"
+            - "python3 -m dynamo.sglang.utils.clear_namespace --namespace sglang-agg && python3 -m dynamo.frontend --http-port=8000"
     SGLangDecodeWorker:
       envFromSecret: hf-token-secret
       livenessProbe:
diff --git a/components/backends/sglang/deploy/agg_router.yaml b/components/backends/sglang/deploy/agg_router.yaml
new file mode 100644
index 0000000000..ab0a5bbce6
--- /dev/null
+++ b/components/backends/sglang/deploy/agg_router.yaml
@@ -0,0 +1,96 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: sglang-agg-router
+spec:
+  services:
+    Frontend:
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8000
+        initialDelaySeconds: 60
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      readinessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - "exit 0"
+        initialDelaySeconds: 60
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      dynamoNamespace: sglang-agg-router
+      componentType: main
+      replicas: 1
+      resources:
+        requests:
+          cpu: "5"
+          memory: "10Gi"
+        limits:
+          cpu: "5"
+          memory: "10Gi"
+      extraPodSpec:
+        mainContainer:
+          image: my-registry/sglang-runtime:my-tag
+          workingDir: /workspace/components/backends/sglang
+          command: ["sh", "-c"]
+          args:
+            - "python3 -m dynamo.sglang.utils.clear_namespace --namespace sglang-agg-router && python3 -m dynamo.frontend --http-port=8000 --router-mode kv"
+    SGLangDecodeWorker:
+      envFromSecret: hf-token-secret
+      livenessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - "exit 0"
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      readinessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - "exit 0"
+        initialDelaySeconds: 60
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      dynamoNamespace: sglang-agg-router
+      componentType: worker
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: my-registry/sglang-runtime:my-tag
+          workingDir: /workspace/components/backends/sglang
+          args:
+            - "python3"
+            - "-m"
+            - "dynamo.sglang.worker"
+            - "--model-path"
+            - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
+            - "--served-model-name"
+            - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
+            - "--page-size"
+            - "16"
+            - "--tp"
+            - "1"
+            - "--trust-remote-code"
+            - "--skip-tokenizer-init"
diff --git a/components/backends/sglang/deploy/disagg.yaml b/components/backends/sglang/deploy/disagg.yaml
new file mode 100644
index 0000000000..06c4b842d2
--- /dev/null
+++ b/components/backends/sglang/deploy/disagg.yaml
@@ -0,0 +1,155 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: sglang-disagg
+spec:
+  services:
+    Frontend:
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8000
+        initialDelaySeconds: 60
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      readinessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - "exit 0"
+        initialDelaySeconds: 60
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      dynamoNamespace: sglang-disagg
+      componentType: main
+      replicas: 1
+      resources:
+        requests:
+          cpu: "5"
+          memory: "10Gi"
+        limits:
+          cpu: "5"
+          memory: "10Gi"
+      extraPodSpec:
+        mainContainer:
+          image: my-registry/sglang-runtime:my-tag
+          workingDir: /workspace/components/backends/sglang
+          command: ["sh", "-c"]
+          args:
+            - "python3 -m dynamo.sglang.utils.clear_namespace --namespace sglang-disagg && python3 -m dynamo.frontend --http-port=8000"
+    SGLangDecodeWorker:
+      envFromSecret: hf-token-secret
+      livenessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - "exit 0"
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      readinessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - "exit 0"
+        initialDelaySeconds: 60
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      dynamoNamespace: sglang-disagg
+      componentType: worker
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: my-registry/sglang-runtime:my-tag
+          workingDir: /workspace/components/backends/sglang
+          args:
+            - "python3"
+            - "-m"
+            - "dynamo.sglang.worker"
+            - "--model-path"
+            - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
+            - "--served-model-name"
+            - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
+            - "--page-size"
+            - "16"
+            - "--tp"
+            - "1"
+            - "--trust-remote-code"
+            - "--skip-tokenizer-init"
+            - "--disaggregation-mode"
+            - "decode"
+            - "--disaggregation-transfer-backend"
+            - "nixl"
+    SGLangPrefillWorker:
+      envFromSecret: hf-token-secret
+      livenessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - "exit 0"
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      readinessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - "exit 0"
+        initialDelaySeconds: 60
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      dynamoNamespace: sglang-disagg
+      componentType: worker
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: my-registry/sglang-runtime:my-tag
+          workingDir: /workspace/components/backends/sglang
+          args:
+            - "python3"
+            - "-m"
+            - "dynamo.sglang.worker"
+            - "--model-path"
+            - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
+            - "--served-model-name"
+            - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
+            - "--page-size"
+            - "16"
+            - "--tp"
+            - "1"
+            - "--trust-remote-code"
+            - "--skip-tokenizer-init"
+            - "--disaggregation-mode"
+            - "prefill"
+            - "--disaggregation-transfer-backend"
+            - "nixl"
diff --git a/components/backends/sglang/launch/agg.sh b/components/backends/sglang/launch/agg.sh
index 94153ba9cb..62029416db 100755
--- a/components/backends/sglang/launch/agg.sh
+++ b/components/backends/sglang/launch/agg.sh
@@ -15,7 +15,7 @@ trap cleanup EXIT INT TERM
 python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo
 
 # run ingress
-dynamo run in=http out=dyn --http-port=8000 &
+python3 -m dynamo.frontend --http-port=8000 &
 DYNAMO_PID=$!
 
 # run worker
diff --git a/components/backends/sglang/launch/disagg.sh b/components/backends/sglang/launch/disagg.sh
index fecd9cdab4..e5d569f0a7 100755
--- a/components/backends/sglang/launch/disagg.sh
+++ b/components/backends/sglang/launch/disagg.sh
@@ -15,7 +15,7 @@ trap cleanup EXIT INT TERM
 python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo
 
 # run ingress
-dynamo run in=http out=dyn --http-port=8000 &
+python3 -m dynamo.frontend --http-port=8000 &
 DYNAMO_PID=$!
 
 # run prefill worker
diff --git a/components/backends/sglang/launch/disagg_dp_attn.sh b/components/backends/sglang/launch/disagg_dp_attn.sh
index 5e6eb43252..9fa04feffe 100755
--- a/components/backends/sglang/launch/disagg_dp_attn.sh
+++ b/components/backends/sglang/launch/disagg_dp_attn.sh
@@ -15,7 +15,7 @@ trap cleanup EXIT INT TERM
 python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo
 
 # run ingress
-dynamo run in=http out=dyn --http-port=8000 &
+python3 -m dynamo.frontend --http-port=8000 &
 DYNAMO_PID=$!
 
 # run prefill worker
@@ -33,7 +33,7 @@ python3 -m dynamo.sglang.worker \
 PREFILL_PID=$!
 
 # run decode worker
-CUDA_VISIBLE_DEVICES=2,3 python3 dynamo.sglang.decode_worker \
+CUDA_VISIBLE_DEVICES=2,3 python3 -m dynamo.sglang.decode_worker \
   --model-path silence09/DeepSeek-R1-Small-2layers \
   --served-model-name silence09/DeepSeek-R1-Small-2layers \
   --tp 2 \