feat: support sglang in sla planner (#2421)

tedzhouhk · coderabbitai[bot] · web-flow · commit e22f84ab183e · 2025-08-12T18:43:13.000-07:00
Signed-off-by: Hongkuan Zhou <tedzhouhk@gmail.com>
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
diff --git a/README.md b/README.md
@@ -58,8 +58,8 @@ Dynamo is designed to be inference engine agnostic (supports TRT-LLM, vLLM, SGLa
 | [**Disaggregated Serving**](/docs/architecture/disagg_serving.md) | ✅ | ✅ | ✅ |
 | [**Conditional Disaggregation**](/docs/architecture/disagg_serving.md#conditional-disaggregation) | 🚧 | 🚧 | 🚧 |
 | [**KV-Aware Routing**](/docs/architecture/kv_cache_routing.md) | ✅ | ✅ | ✅ |
-| [**SLA-Based Planner**](/docs/architecture/sla_planner.md) | ✅ | 🚧 | 🚧 |
-| [**Load Based Planner**](/docs/architecture/load_planner.md) | ✅ | 🚧 | 🚧 |
+| [**Load Based Planner**](/docs/architecture/load_planner.md) | 🚧 | 🚧 | 🚧 |
+| [**SLA-Based Planner**](/docs/architecture/sla_planner.md) | ✅ | ✅ | 🚧 |
 | [**KVBM**](/docs/architecture/kvbm_architecture.md) | 🚧 | 🚧 | 🚧 |
 
 To learn more about each framework and their capabilities, check out each framework's README!
diff --git a/components/backends/sglang/README.md b/components/backends/sglang/README.md
@@ -37,7 +37,7 @@ git checkout $(git describe --tags $(git rev-list --tags --max-count=1))
 | [**Disaggregated Serving**](../../../docs/architecture/disagg_serving.md) | ✅ |  |
 | [**Conditional Disaggregation**](../../../docs/architecture/disagg_serving.md#conditional-disaggregation) | 🚧 | WIP [PR](https://github.com/sgl-project/sglang/pull/7730) |
 | [**KV-Aware Routing**](../../../docs/architecture/kv_cache_routing.md) | ✅ |  |
-| [**SLA-Based Planner**](../../../docs/architecture/sla_planner.md) | ❌ | Planned |
+| [**SLA-Based Planner**](../../../docs/architecture/sla_planner.md) | ✅ |  |
 | [**Load Based Planner**](../../../docs/architecture/load_planner.md) | ❌ | Planned |
 | [**KVBM**](../../../docs/architecture/kvbm_architecture.md) | ❌ | Planned |
 
@@ -197,7 +197,7 @@ curl localhost:8000/v1/chat/completions \
         "content": "Explain why Roger Federer is considered one of the greatest tennis players of all time"
     }
     ],
-    "stream": false,
+    "stream": true,
     "max_tokens": 30
   }'
 ```
diff --git a/components/backends/sglang/deploy/disagg_planner.yaml b/components/backends/sglang/deploy/disagg_planner.yaml
@@ -0,0 +1,267 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: sglang-disagg-planner
+  annotations:
+    nvidia.com/enable-grove: "false"
+spec:
+  envs:
+    - name: DYNAMO_SERVICE_CONFIG
+      value: '{"Prometheus":{"global":{"scrape_interval":"5s"},"scrape_configs":[{"job_name":"prometheus","static_configs":[{"targets":["localhost:9090"]}]},{"job_name":"frontend","static_configs":[{"targets":["sglang-disagg-planner-frontend:8000"]}]}]}}'
+    - name: DYNAMO_NAMESPACE
+      value: "dynamo"
+  services:
+    Frontend:
+      dynamoNamespace: dynamo
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8000
+        initialDelaySeconds: 20
+        periodSeconds: 5
+        timeoutSeconds: 5
+        failureThreshold: 3
+      readinessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
+        initialDelaySeconds: 60
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      componentType: main
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "10Gi"
+        limits:
+          cpu: "32"
+          memory: "40Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidian/nim-llm-dev/sglang-runtime:hzhou-0811-1
+          workingDir: /workspace/components/backends/sglang
+          command: ["sh", "-c"]
+          args: # clear_namespace must target the namespace this deployment uses (dynamoNamespace / DYNAMO_NAMESPACE = "dynamo"); the frontend starts only if it succeeds
+            - "python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo && python3 -m dynamo.frontend --http-port=8000"
+    Planner:
+      dynamoNamespace: dynamo
+      envFromSecret: hf-token-secret
+      componentType: planner
+      replicas: 1
+      livenessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - "exit 0"
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      readinessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - "exit 0"
+        initialDelaySeconds: 60
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      resources:
+        requests:
+          cpu: "2"
+          memory: "2Gi"
+        limits:
+          cpu: "8"
+          memory: "16Gi"
+      pvc:
+        create: false
+        name: profiling-pvc # This PVC must be created before deployment, and the SLA profiler must already have been run
+        mountPoint: /workspace/profiling_results
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidian/nim-llm-dev/sglang-runtime:hzhou-0811-1
+          workingDir: /workspace/components/planner/src/dynamo/planner
+          args:
+            - python
+            - -m
+            - planner_sla
+            - --environment=kubernetes
+            - --backend=sglang
+            - --adjustment-interval=60
+            - --profile-results-dir=/workspace/profiling_results
+    Prometheus:
+      dynamoNamespace: dynamo
+      componentType: main
+      replicas: 1
+      envs:
+        - name: PYTHONPATH
+          value: "/workspace/components/planner/src"
+      livenessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - "exit 0"
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      readinessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - "exit 0"
+        initialDelaySeconds: 30
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      resources:
+        requests:
+          cpu: "2"
+          memory: "2Gi"
+        limits:
+          cpu: "8"
+          memory: "16Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidian/nim-llm-dev/sglang-runtime:hzhou-0811-1
+          workingDir: /workspace/components/backends/sglang
+          command:
+            - /bin/sh
+            - -c
+          args:
+            - "python3 -m dynamo.planner.prometheus"
+    SGLangDecodeWorker:
+      dynamoNamespace: dynamo
+      envFromSecret: hf-token-secret
+      livenessProbe:
+        httpGet:
+          path: /live
+          port: 9090
+        periodSeconds: 5
+        timeoutSeconds: 30
+        failureThreshold: 1
+      readinessProbe:
+        httpGet:
+          path: /health
+          port: 9090
+        periodSeconds: 10
+        timeoutSeconds: 30
+        failureThreshold: 60
+      componentType: worker
+      replicas: 2
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+        limits:
+          cpu: "32"
+          memory: "80Gi"
+          gpu: "1"
+      envs:
+        - name: DYN_SYSTEM_ENABLED
+          value: "true"
+        - name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
+          value: "[\"generate\"]"
+        - name: DYN_SYSTEM_PORT
+          value: "9090"
+      extraPodSpec:
+        mainContainer:
+          startupProbe:
+            httpGet:
+              path: /live
+              port: 9090
+            periodSeconds: 10
+            failureThreshold: 60
+          image: nvcr.io/nvidian/nim-llm-dev/sglang-runtime:hzhou-0811-1
+          workingDir: /workspace/components/backends/sglang
+          args:
+            - "python3"
+            - "-m"
+            - "dynamo.sglang.decode_worker"
+            - "--model-path"
+            - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
+            - "--served-model-name"
+            - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
+            - "--page-size"
+            - "16"
+            - "--tp"
+            - "1"
+            - "--trust-remote-code"
+            - "--skip-tokenizer-init"
+            - "--disaggregation-mode"
+            - "decode"
+            - "--disaggregation-transfer-backend"
+            - "nixl"
+    SGLangPrefillWorker:
+      dynamoNamespace: dynamo
+      envFromSecret: hf-token-secret
+      livenessProbe:
+        httpGet:
+          path: /live
+          port: 9090
+        periodSeconds: 5
+        timeoutSeconds: 30
+        failureThreshold: 1
+      readinessProbe:
+        httpGet:
+          path: /health
+          port: 9090
+        periodSeconds: 10
+        timeoutSeconds: 30
+        failureThreshold: 60
+      componentType: worker
+      replicas: 2
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+        limits:
+          cpu: "32"
+          memory: "80Gi"
+          gpu: "1"
+      envs:
+        - name: DYN_SYSTEM_ENABLED
+          value: "true"
+        - name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
+          value: "[\"generate\"]"
+        - name: DYN_SYSTEM_PORT
+          value: "9090"
+      extraPodSpec:
+        mainContainer:
+          startupProbe:
+            httpGet:
+              path: /health
+              port: 9090
+            periodSeconds: 10
+            failureThreshold: 60
+          image: nvcr.io/nvidian/nim-llm-dev/sglang-runtime:hzhou-0811-1
+          workingDir: /workspace/components/backends/sglang
+          args:
+            - "python3"
+            - "-m"
+            - "dynamo.sglang.worker"
+            - "--model-path"
+            - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
+            - "--served-model-name"
+            - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
+            - "--page-size"
+            - "16"
+            - "--tp"
+            - "1"
+            - "--trust-remote-code"
+            - "--skip-tokenizer-init"
+            - "--disaggregation-mode"
+            - "prefill"
+            - "--disaggregation-transfer-backend"
+            - "nixl"
diff --git a/components/planner/src/dynamo/planner/planner_sla.py b/components/planner/src/dynamo/planner/planner_sla.py
@@ -62,7 +62,7 @@ async def generate(request: RequestType):
     parser.add_argument(
         "--backend",
         default=SLAPlannerDefaults.backend,
-        choices=["vllm"],
+        choices=["vllm", "sglang"],
         help="Backend type",
     )
     parser.add_argument(
diff --git a/container/Dockerfile.sglang b/container/Dockerfile.sglang
@@ -231,6 +231,20 @@ ARG CARGO_BUILD_JOBS
 # which might exceed the number of opened files limit.
 ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16}
 
+# Install the Prometheus server binary (release PROM_VERSION) matching the image's CPU architecture
+ARG PROM_VERSION=3.4.1
+RUN ARCH=$(dpkg --print-architecture) && \
+    case "$ARCH" in \
+        amd64) PLATFORM=linux-amd64 ;; \
+        arm64) PLATFORM=linux-arm64 ;; \
+        *) echo "Unsupported architecture: $ARCH" && exit 1 ;; \
+    esac && \
+    curl -fsSL https://github.com/prometheus/prometheus/releases/download/v${PROM_VERSION}/prometheus-${PROM_VERSION}.${PLATFORM}.tar.gz \
+    | tar -xz -C /tmp && \
+    mv /tmp/prometheus-${PROM_VERSION}.${PLATFORM}/prometheus /usr/local/bin/ && \
+    chmod +x /usr/local/bin/prometheus && \
+    rm -rf /tmp/prometheus-${PROM_VERSION}.${PLATFORM}
+
 #######################################
 ########## Local Development ##########
 #######################################
diff --git a/docs/guides/dynamo_deploy/sla_planner_deployment.md b/docs/guides/dynamo_deploy/sla_planner_deployment.md
@@ -1,6 +1,6 @@
 # SLA Planner Deployment Guide
 
-Quick deployment guide for the vLLM disaggregated planner with automatic scaling.
+Quick deployment guide for the disaggregated planner with automatic scaling.
 
 > [!NOTE]
 > For high-level architecture and concepts, see [SLA-based Planner](../../architecture/sla_planner.md).
@@ -34,9 +34,11 @@ export NAMESPACE=your-namespace
 
 ## 1. Deploy the System
 
+We use vLLM as the backend engine in this guide. The SLA planner also supports SGLang, and TensorRT-LLM support is planned. Check out the `disagg_planner.yaml` in each backend's example deployment folder (for example, `components/backends/sglang/deploy/disagg_planner.yaml`) for more details. The deployment steps are the same for all backends.
+
 ```bash
 # Apply the disaggregated planner deployment
-kubectl apply -f components/backends/vllm/deploy/disagg_planner.yaml -n $NAMESPACE
+kubectl apply -f components/backends/vllm/deploy/disagg_planner.yaml -n $NAMESPACE # for vllm
 
 # Check deployment status
 kubectl get pods -n $NAMESPACE

Original file line number	Diff line number	Diff line change
`@@ -62,7 +62,7 @@ async def generate(request: RequestType):`
`62`	`62`	`parser.add_argument(`
`63`	`63`	`"--backend",`
`64`	`64`	`default=SLAPlannerDefaults.backend,`
`65`		`- choices=["vllm"],`
	`65`	`+ choices=["vllm", "sglang"],`
`66`	`66`	`help="Backend type",`
`67`	`67`	`)`
`68`	`68`	`parser.add_argument(`