diff --git a/Cargo.lock b/Cargo.lock index b9a4093e95..5c4b0afd0c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4561,9 +4561,9 @@ dependencies = [ [[package]] name = "nixl-sys" -version = "0.4.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97f621270fd1ed8af5a8028a1945e6f7e612a38836ce82b720fe54222739df3c" +checksum = "743ed1038b386b75451f9e0bba37cb2e3eea75873635268337d6531be99c9303" dependencies = [ "bindgen 0.71.1", "cc", diff --git a/README.md b/README.md index 759a9187d8..4369dc8142 100644 --- a/README.md +++ b/README.md @@ -21,12 +21,30 @@ limitations under the License. [![Discord](https://dcbadge.limes.pink/api/server/D92uqZRjCZ?style=flat)](https://discord.gg/D92uqZRjCZ) [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/ai-dynamo/dynamo) -| **[Roadmap](https://github.com/ai-dynamo/dynamo/issues/762)** | **[Documentation](https://docs.nvidia.com/dynamo/latest/index.html)** | **[Examples](https://github.com/ai-dynamo/dynamo/tree/main/examples)** | **[Design Proposals](https://github.com/ai-dynamo/enhancements)** | +| **[Roadmap](https://github.com/ai-dynamo/dynamo/issues/762)** | **[Documentation](https://docs.nvidia.com/dynamo/latest/index.html)** | **[Support Matrix](docs/support_matrix.md)** | **[Examples](https://github.com/ai-dynamo/dynamo/tree/main/examples)** | **[Design Proposals](https://github.com/ai-dynamo/enhancements)** | # NVIDIA Dynamo High-throughput, low-latency inference framework designed for serving generative AI and reasoning models in multi-node distributed environments. +## Framework Support Matrix + +| Feature | vLLM | SGLang | TensorRT-LLM | +|---------|----------------------|----------------------------|----------------------------------------| +| [**Disaggregated Serving**](/docs/architecture/disagg_serving.md) | ✅ | ✅ | ✅ | +| [**Conditional Disaggregation**](/docs/architecture/disagg_serving.md#conditional-disaggregation) | 🚧 | 🚧 | 🚧 | +| [**KV-Aware Routing**](/docs/architecture/kv_cache_routing.md) | ✅ | ✅ | ✅ | +| [**SLA-Based Planner**](/docs/architecture/sla_planner.md) | ✅ | 🚧 | 🚧 | +| [**Load Based Planner**](/docs/architecture/load_planner.md) | ✅ | 🚧 | 🚧 | +| [**KVBM**](/docs/architecture/kvbm_architecture.md) | 🚧 | 🚧 | 🚧 | + +To learn more about each framework and their capabilities, check out each framework's README and deploy them with Dynamo! +- **[vLLM](components/backends/vllm/README.md)** +- **[SGLang](components/backends/sglang/README.md)** +- **[TensorRT-LLM](components/backends/trtllm/README.md)** + +Built in Rust for performance and in Python for extensibility, Dynamo is fully open-source and driven by a transparent, OSS (Open Source Software) first development approach. + ## The Era of Multi-GPU, Multi-Node

@@ -47,24 +65,6 @@ Dynamo is designed to be inference engine agnostic (supports TRT-LLM, vLLM, SGLa Dynamo architecture

-## Framework Support Matrix - -| Feature | vLLM | SGLang | TensorRT-LLM | -|---------|----------------------|----------------------------|----------------------------------------| -| [**Disaggregated Serving**](/docs/architecture/disagg_serving.md) | ✅ | ✅ | ✅ | -| [**Conditional Disaggregation**](/docs/architecture/disagg_serving.md#conditional-disaggregation) | 🚧 | 🚧 | 🚧 | -| [**KV-Aware Routing**](/docs/architecture/kv_cache_routing.md) | ✅ | ✅ | ✅ | -| [**SLA-Based Planner**](/docs/architecture/sla_planner.md) | ✅ | 🚧 | 🚧 | -| [**Load Based Planner**](/docs/architecture/load_planner.md) | ✅ | 🚧 | 🚧 | -| [**KVBM**](/docs/architecture/kvbm_architecture.md) | 🚧 | 🚧 | 🚧 | - -To learn more about each framework and their capabilities, check out each framework's README! -- **[vLLM](components/backends/vllm/README.md)** -- **[SGLang](components/backends/sglang/README.md)** -- **[TensorRT-LLM](components/backends/trtllm/README.md)** - -Built in Rust for performance and in Python for extensibility, Dynamo is fully open-source and driven by a transparent, OSS (Open Source Software) first development approach. - # Installation The following examples require a few system level packages. @@ -115,11 +115,11 @@ Dynamo provides a simple way to spin up a local set of inference components incl ``` # Start an OpenAI compatible HTTP server, a pre-processor (prompt templating and tokenization) and a router: -python -m dynamo.frontend [--http-port 8080] +python -m dynamo.frontend --http-port 8080 # Start the SGLang engine, connecting to NATS and etcd to receive requests. You can run several of these, # both for the same model and for multiple models. The frontend node will discover them. -python -m dynamo.sglang.worker deepseek-ai/DeepSeek-R1-Distill-Llama-8B +python -m dynamo.sglang.worker --model deepseek-ai/DeepSeek-R1-Distill-Llama-8B --skip-tokenizer-init ``` #### Send a Request @@ -167,10 +167,15 @@ To specify which GPUs to use set environment variable `CUDA_VISIBLE_DEVICES`. ## SGLang + ``` -# Install libnuma +# Install libnuma-dev apt install -y libnuma-dev +# Install flashinfer-python pre-release (required by sglang for optimized inference) +uv pip install "flashinfer-python==0.2.9rc2" --prerelease=allow + +# Install ai-dynamo with sglang support uv pip install ai-dynamo[sglang] ``` diff --git a/benchmarks/llm/README.md b/benchmarks/llm/README.md index e0cb8e976d..614dbd9be4 100644 --- a/benchmarks/llm/README.md +++ b/benchmarks/llm/README.md @@ -12,4 +12,3 @@ See the License for the specific language governing permissions and limitations under the License. --> -[../../examples/llm/benchmarks/README.md](../../examples/llm/benchmarks/README.md) diff --git a/components/README.md b/components/README.md index 2c5677eae7..3f638f5371 100644 --- a/components/README.md +++ b/components/README.md @@ -77,4 +77,4 @@ To get started with Dynamo components: 4. **Run deployment scripts** from the engine's launch directory 5. **Monitor performance** using the metrics component -For detailed instructions, see the README files in each component directory and the main [Dynamo documentation](../../docs/). +For detailed instructions, see the README files in each component directory and the main [Dynamo documentation](../docs/). 
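Editor's note: the quick-start hunk above starts the OpenAI-compatible frontend on port 8080 and registers an SGLang worker for `deepseek-ai/DeepSeek-R1-Distill-Llama-8B`, but the body of the "Send a Request" step falls outside the diff context. A minimal sketch of such a request, assuming the defaults shown in that hunk:

```bash
# Assumes the frontend from the hunk above is listening on localhost:8080 and the
# SGLang worker has registered deepseek-ai/DeepSeek-R1-Distill-Llama-8B.
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    "messages": [{"role": "user", "content": "Hello, who are you?"}],
    "max_tokens": 64,
    "stream": false
  }'
```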
diff --git a/components/backends/llama_cpp/README.md b/components/backends/llama_cpp/README.md index f7c9e6520e..78a553c0c1 100644 --- a/components/backends/llama_cpp/README.md +++ b/components/backends/llama_cpp/README.md @@ -13,7 +13,7 @@ python -m dynamo.llama_cpp --model-path /data/models/Qwen3-0.6B-Q8_0.gguf [args] ## Request Migration -In a [Distributed System](#distributed-system), a request may fail due to connectivity issues between the Frontend and the Backend. +In a Distributed System, a request may fail due to connectivity issues between the Frontend and the Backend. The Frontend will automatically track which Backends are having connectivity issues with it and avoid routing new requests to the Backends with known connectivity issues. diff --git a/components/backends/sglang/README.md b/components/backends/sglang/README.md index ffb58e76a0..705c65d3a8 100644 --- a/components/backends/sglang/README.md +++ b/components/backends/sglang/README.md @@ -34,12 +34,12 @@ git checkout $(git describe --tags $(git rev-list --tags --max-count=1)) | Feature | SGLang | Notes | |---------|--------|-------| -| [**Disaggregated Serving**](../../docs/architecture/disagg_serving.md) | ✅ | | -| [**Conditional Disaggregation**](../../docs/architecture/disagg_serving.md#conditional-disaggregation) | 🚧 | WIP [PR](https://github.com/sgl-project/sglang/pull/7730) | -| [**KV-Aware Routing**](../../docs/architecture/kv_cache_routing.md) | ✅ | | -| [**SLA-Based Planner**](../../docs/architecture/sla_planner.md) | ❌ | Planned | -| [**Load Based Planner**](../../docs/architecture/load_planner.md) | ❌ | Planned | -| [**KVBM**](../../docs/architecture/kvbm_architecture.md) | ❌ | Planned | +| [**Disaggregated Serving**](../../../docs/architecture/disagg_serving.md) | ✅ | | +| [**Conditional Disaggregation**](../../../docs/architecture/disagg_serving.md#conditional-disaggregation) | 🚧 | WIP [PR](https://github.com/sgl-project/sglang/pull/7730) | +| [**KV-Aware Routing**](../../../docs/architecture/kv_cache_routing.md) | ✅ | | +| [**SLA-Based Planner**](../../../docs/architecture/sla_planner.md) | ❌ | Planned | +| [**Load Based Planner**](../../../docs/architecture/load_planner.md) | ❌ | Planned | +| [**KVBM**](../../../docs/architecture/kvbm_architecture.md) | ❌ | Planned | ### Large Scale P/D and WideEP Features @@ -52,8 +52,7 @@ git checkout $(git describe --tags $(git rev-list --tags --max-count=1)) ## Quick Start -Below we provide a guide that lets you run all of our the common deployment patterns on a single node. See our different [architectures](../llm/README.md#deployment-architectures) for a high level overview of each pattern and the architecture diagram for each. - +Below we provide a guide that lets you run all of the common deployment patterns on a single node. ### Start NATS and ETCD in the background Start using [Docker Compose](../../../deploy/docker-compose.yml) @@ -141,7 +140,7 @@ cd $DYNAMO_ROOT/components/backends/sglang ## Request Migration -In a [Distributed System](#distributed-system), a request may fail due to connectivity issues between the Frontend and the Backend. +In a Distributed System, a request may fail due to connectivity issues between the Frontend and the Backend. The Frontend will automatically track which Backends are having connectivity issues with it and avoid routing new requests to the Backends with known connectivity issues. @@ -164,7 +163,6 @@ Below we provide a selected list of advanced examples.
Please open up an issue i ### Large scale P/D disaggregation with WideEP - **[Run DeepSeek-R1 on 104+ H100s](docs/dsr1-wideep-h100.md)** -- **[Run DeepSeek-R1 on GB200s](docs/dsr1-wideep-gb200.md)** ### Speculative Decoding - **[Deploying DeepSeek-R1 with MTP - coming soon!](.)** diff --git a/components/backends/sglang/deploy/README.md b/components/backends/sglang/deploy/README.md new file mode 100644 index 0000000000..c41b6793ff --- /dev/null +++ b/components/backends/sglang/deploy/README.md @@ -0,0 +1,162 @@ +# SGLang Kubernetes Deployment Configurations + +This directory contains Kubernetes Custom Resource Definition (CRD) templates for deploying SGLang inference graphs using the **DynamoGraphDeployment** resource. + +## Available Deployment Patterns + +### 1. **Aggregated Deployment** (`agg.yaml`) +Basic deployment pattern with frontend and a single decode worker. + +**Architecture:** +- `Frontend`: OpenAI-compatible API server +- `SGLangDecodeWorker`: Single worker handling both prefill and decode + +### 2. **Aggregated Router Deployment** (`agg_router.yaml`) +Enhanced aggregated deployment with KV cache routing capabilities. + +**Architecture:** +- `Frontend`: OpenAI-compatible API server with router mode enabled (`--router-mode kv`) +- `SGLangDecodeWorker`: Single worker handling both prefill and decode + +### 3. **Disaggregated Deployment** (`disagg.yaml`) +High-performance deployment with separated prefill and decode workers. + +**Architecture:** +- `Frontend`: HTTP API server coordinating between workers +- `SGLangDecodeWorker`: Specialized decode-only worker (`--disaggregation-mode decode`) +- `SGLangPrefillWorker`: Specialized prefill-only worker (`--disaggregation-mode prefill`) +- Communication via NIXL transfer backend (`--disaggregation-transfer-backend nixl`) + +## CRD Structure + +All templates use the **DynamoGraphDeployment** CRD: + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: <deployment-name> +spec: + services: + <service-name>: + # Service configuration +``` + +### Key Configuration Options + +**Resource Management:** +```yaml +resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" +``` + +**Container Configuration:** +```yaml +extraPodSpec: + mainContainer: + image: my-registry/sglang-runtime:my-tag + workingDir: /workspace/components/backends/sglang + args: + - "python3" + - "-m" + - "dynamo.sglang.worker" + # Model-specific arguments +``` + +## Prerequisites + +Before using these templates, ensure you have: + +1. **Dynamo Cloud Platform installed** - See [Installing Dynamo Cloud](../../../../docs/guides/dynamo_deploy/dynamo_cloud.md) +2. **Kubernetes cluster with GPU support** +3. **Container registry access** for SGLang runtime images +4. **HuggingFace token secret** (referenced as `envFromSecret: hf-token-secret`) + +## Usage + +### 1. Choose Your Template +Select the deployment pattern that matches your requirements: +- Use `agg.yaml` for development/testing +- Use `agg_router.yaml` for production with load balancing +- Use `disagg.yaml` for maximum performance + +### 2. Customize Configuration +Edit the template to match your environment: + +```yaml +# Update image registry and tag +image: your-registry/sglang-runtime:your-tag + +# Configure your model +args: + - "--model-path" + - "your-org/your-model" + - "--served-model-name" + - "your-org/your-model" +``` + +### 3. Deploy + +Use the following commands to deploy your chosen deployment file. + +First, create a secret for the HuggingFace token.
+```bash +export HF_TOKEN=your_hf_token +kubectl create secret generic hf-token-secret \ + --from-literal=HF_TOKEN=${HF_TOKEN} \ + -n ${NAMESPACE} +``` + +Then, deploy the model using the deployment file. + +```bash +export DEPLOYMENT_FILE=agg.yaml +kubectl apply -f $DEPLOYMENT_FILE -n ${NAMESPACE} +``` + +### 4. Using Custom Dynamo Frameworks Image for SGLang + +To use a custom dynamo frameworks image for SGLang, you can update the deployment file using yq: + +```bash +export DEPLOYMENT_FILE=agg.yaml +export FRAMEWORK_RUNTIME_IMAGE= + +yq '.spec.services.[].extraPodSpec.mainContainer.image = env(FRAMEWORK_RUNTIME_IMAGE)' $DEPLOYMENT_FILE > $DEPLOYMENT_FILE.generated +kubectl apply -f $DEPLOYMENT_FILE.generated -n $NAMESPACE +``` + +## Model Configuration + +All templates use **DeepSeek-R1-Distill-Llama-8B** as the default model. But you can use any sglang argument and configuration. Key parameters: + +## Monitoring and Health + +- **Frontend health endpoint**: `http://:8000/health` +- **Liveness probes**: Check process health every 60s + +## Further Reading + +- **Deployment Guide**: [Creating Kubernetes Deployments](../../../../docs/guides/dynamo_deploy/create_deployment.md) +- **Quickstart**: [Deployment Quickstart](../../../../docs/guides/dynamo_deploy/quickstart.md) +- **Platform Setup**: [Dynamo Cloud Installation](../../../../docs/guides/dynamo_deploy/dynamo_cloud.md) +- **Examples**: [Deployment Examples](../../../../docs/examples/README.md) +- **Kubernetes CRDs**: [Custom Resources Documentation](https://kubernetes.io/docs/concepts/extend-kubernetes/api-extension/custom-resources/) + +## Troubleshooting + +Common issues and solutions: + +1. **Pod fails to start**: Check image registry access and HuggingFace token secret +2. **GPU not allocated**: Verify cluster has GPU nodes and proper resource limits +3. **Health check failures**: Review model loading logs and increase `initialDelaySeconds` +4. **Out of memory**: Increase memory limits or reduce model batch size + +For additional support, refer to the [deployment troubleshooting guide](../../docs/guides/dynamo_deploy/quickstart.md#troubleshooting). diff --git a/components/backends/sglang/deploy/disagg.yaml b/components/backends/sglang/deploy/disagg.yaml index 06c4b842d2..aa90223486 100644 --- a/components/backends/sglang/deploy/disagg.yaml +++ b/components/backends/sglang/deploy/disagg.yaml @@ -83,7 +83,7 @@ spec: args: - "python3" - "-m" - - "dynamo.sglang.worker" + - "dynamo.sglang.decode_worker" - "--model-path" - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" - "--served-model-name" @@ -152,4 +152,4 @@ spec: - "--disaggregation-mode" - "prefill" - "--disaggregation-transfer-backend" - - "nixl" \ No newline at end of file + - "nixl" diff --git a/components/backends/sglang/docs/dsr1-wideep-h100.md b/components/backends/sglang/docs/dsr1-wideep-h100.md index d766bc3edf..6cfcace10d 100644 --- a/components/backends/sglang/docs/dsr1-wideep-h100.md +++ b/components/backends/sglang/docs/dsr1-wideep-h100.md @@ -5,26 +5,18 @@ SPDX-License-Identifier: Apache-2.0 # Running DeepSeek-R1 Disaggregated with WideEP on H100s -Dynamo supports SGLang's implementation of wide expert parallelism and large scale P/D for DeepSeek-R1! You can read their blog post [here](https://www.nvidia.com/en-us/technologies/ai/deepseek-r1-large-scale-p-d-with-wide-expert-parallelism/) for more details. We provide a Dockerfile for this in `container/Dockerfile.sglang-deepep` and configurations to deploy this at scale. 
In this example, we will run 1 prefill worker on 4 H100 nodes and 1 decode worker on 9 H100 nodes (104 total GPUs). +Dynamo supports SGLang's implementation of wide expert parallelism and large scale P/D for DeepSeek-R1! You can read their blog post [here](https://lmsys.org/blog/2025-05-05-large-scale-ep/) for more details. We provide a Dockerfile for this in `container/Dockerfile.sglang-deepep` and configurations to deploy this at scale. In this example, we will run 1 prefill worker on 4 H100 nodes and 1 decode worker on 9 H100 nodes (104 total GPUs). ## Instructions -1. Pull the SGLang container. - -```bash -docker pull lmsysorg/sglang:latest -``` - -You can also pull a specific tag from the [lmsys dockerhub](https://hub.docker.com/r/lmsysorg/sglang/tags) - -2. Build the Dynamo container +1. Build the Dynamo container ```bash cd $DYNAMO_ROOT docker build -f container/Dockerfile.sglang-wideep . -t dynamo-wideep --no-cache ``` -3. You can run this container on each 8xH100 node using the following command. +2. You can run this container on each 8xH100 node using the following command. > [!IMPORTANT] > We recommend downloading DeepSeek-R1 and then mounting it to the container. You can find the model [here](https://huggingface.co/deepseek-ai/DeepSeek-R1) @@ -47,19 +39,19 @@ docker run \ In each container, you should be in the `/sgl-workspace/dynamo/components/backends/sglang` directory. -4. On the head prefill node, run the helper script provided to generate commands to start the `nats-server`, `etcd`. This script will also tell you which environment variables to export on each node to make deployment easier. +3. On the head prefill node, run the helper script provided to generate commands to start the `nats-server`, `etcd`. This script will also tell you which environment variables to export on each node to make deployment easier. ```bash -./utils/gen_env_vars.sh +./components/backends/sglang/src/dynamo/sglang/utils/gen_env_vars.sh ``` -5. Run the ingress and prefill worker +4. Run the ingress and prefill worker ```bash # run ingress -dynamo run in=http out=dyn & +python3 -m dynamo.frontend --http-port=8000 & # optionally run the http server that allows you to flush the kv cache for all workers (see benchmarking section below) -python3 utils/sgl_http_server.py --ns dynamo & +python3 -m dynamo.sglang.utils.sgl_http_server --ns dynamo & # run prefill worker python3 -m dynamo.sglang.worker \ --model-path /model/ \ @@ -93,7 +85,7 @@ python3 -m dynamo.sglang.worker \ On the other prefill node (since this example has 4 total prefill nodes), run the same command but change `--node-rank` to 1,2, and 3 -7. Run the decode worker on the head decode node +5. Run the decode worker on the head decode node ```bash python3 -m dynamo.sglang.decode_worker \ @@ -121,7 +113,7 @@ python3 -m dynamo.sglang.decode_worker \ --deepep-mode low_latency \ --mem-fraction-static 0.835 \ --ep-num-redundant-experts 32 \ - --cuda-graph-bs 256 + --cuda-graph-bs 128 ``` On the other decode nodes (this example has 9 total decode nodes), run the same command but change `--node-rank` to 1, 2, 3, 4, 5, 6, 7, and 8 @@ -131,6 +123,7 @@ On the other decode nodes (this example has 9 total decode nodes), run the same In the official [blog post repro instructions](https://github.com/sgl-project/sglang/issues/6017), SGL uses batch inference to benchmark their prefill and decode workers. 
They do this by pretokenizing the ShareGPT dataset and then creating a batch of 8192 requests with ISL 4096 and OSL 5 (for prefill stress test) and a batch of 40000 with ISL 2000 and OSL 100 (for decode stress test). If you want to repro these benchmarks, you will need to add the following flags to the prefill and decode commands: prefill: + ```bash ... --max-running-requests 8192 \ @@ -142,6 +135,7 @@ prefill: ``` decode: + ```bash ... --max-running-requests 18432 \ @@ -152,9 +146,10 @@ decode: We currently provide 2 different ways to perform an end to end benchmark which includes using our OpenAI frontend and tokenization. We will continue to add better support for these sorts of large single batch workloads in the future. 1. **GenAI Perf to benchmark end to end performance with 8k ISL 256 OSL** -We've found that 8k ISL 256 OSL provides a good baseline for measuring end to end disaggregated serving performance for DSR1. As WideEP allows for a higher throughput, we provide a script that runs this workload at high concurrencies. DeepGEMM kernels can sometimes take a while to warm up. We provide a short ramping warmup script that can be used. + We've found that 8k ISL 256 OSL provides a good baseline for measuring end to end disaggregated serving performance for DSR1. As WideEP allows for a higher throughput, we provide a script that runs this workload at high concurrencies. DeepGEMM kernels can sometimes take a while to warm up. We provide a short ramping warmup script that can be used. Example usage: + ```bash # warmup ./utils/bench.sh HEAD_PREFILL_NODE_IP --type warmup @@ -165,9 +160,10 @@ curl -X POST http://${HEAD_PREFILL_NODE_IP}:9001/flush_cache ``` 2. **GenAI Perf to benchmark completions with custom dataset** -We provide a script that generates a JSONL file of the ShareGPT dataset and then use GenAI Perf to benchmark the prefill and decode workers. We use ShareGPT in order to leverage the pre-existing EPLB distributions provided by the SGLang team. If you don't want to use ShareGPT - you can also use GenAIPerf's synthetic dataset setup But note you will have to use dynamic EPLB configurations or record your own as the `init-expert-location` provided by SGLang is tuned specifically for the ShareGPT dataset at a 4096 ISL and 5 OSL. + We provide a script that generates a JSONL file of the ShareGPT dataset and then use GenAI Perf to benchmark the prefill and decode workers. We use ShareGPT in order to leverage the pre-existing EPLB distributions provided by the SGLang team. If you don't want to use ShareGPT - you can also use GenAIPerf's synthetic dataset setup But note you will have to use dynamic EPLB configurations or record your own as the `init-expert-location` provided by SGLang is tuned specifically for the ShareGPT dataset at a 4096 ISL and 5 OSL. 
Example usage: + ```bash # generate data python3 src/dynamo/sglang/utils/generate_bench_data.py --output data.jsonl --num-prompts 8192 --input-len 4096 --output-len 5 --model deepseek-ai/DeepSeek-R1 diff --git a/components/backends/sglang/docs/multinode-examples.md b/components/backends/sglang/docs/multinode-examples.md index 2bc0a802ff..d6ae5e32e0 100644 --- a/components/backends/sglang/docs/multinode-examples.md +++ b/components/backends/sglang/docs/multinode-examples.md @@ -19,7 +19,7 @@ SGLang allows you to deploy multi-node sized models by adding in the `dist-init- Node 1: Run HTTP ingress, processor, and 8 shards of the prefill worker ```bash # run ingress -dynamo run in=http out=dyn & +python3 -m dynamo.frontend --http-port=8000 & # run prefill worker python3 -m dynamo.sglang.worker \ --model-path /model/ \ @@ -102,7 +102,7 @@ SGLang typically requires a warmup period to ensure the DeepGEMM kernels are loa curl ${HEAD_PREFILL_NODE_IP}:8000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", + "model": "deepseek-ai/DeepSeek-R1", "messages": [ { "role": "user", diff --git a/components/backends/sglang/docs/sgl-http-server.md b/components/backends/sglang/docs/sgl-http-server.md index 0d87b760c3..28e2b2400a 100644 --- a/components/backends/sglang/docs/sgl-http-server.md +++ b/components/backends/sglang/docs/sgl-http-server.md @@ -74,7 +74,7 @@ The server accepts the following command-line arguments: Start the server: ```bash -python src/dynamo/sglang/utils/sgl_http_server.py --port 9001 --namespace dynamo +python3 -m dynamo.sglang.utils.sgl_http_server --ns dynamo ``` The server will automatically discover all SGLang components in the specified namespace and provide HTTP endpoints for managing them. diff --git a/components/backends/sglang/launch/agg_router.sh b/components/backends/sglang/launch/agg_router.sh index b45509235c..46a0eff19d 100755 --- a/components/backends/sglang/launch/agg_router.sh +++ b/components/backends/sglang/launch/agg_router.sh @@ -15,7 +15,8 @@ trap cleanup EXIT INT TERM python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo # run ingress -dynamo run in=http out=dyn --router-mode kv --http-port=8000 & +# run ingress +python -m dynamo.frontend --router-mode kv --http-port=8000 & DYNAMO_PID=$! # run worker diff --git a/components/backends/sglang/slurm_jobs/README.md b/components/backends/sglang/slurm_jobs/README.md index 19f7c27ada..7fa454f39c 100644 --- a/components/backends/sglang/slurm_jobs/README.md +++ b/components/backends/sglang/slurm_jobs/README.md @@ -1,10 +1,10 @@ # Example: Deploy Multi-node SGLang with Dynamo on SLURM -This folder implements the example of [SGLang DeepSeek-R1 Disaggregated with WideEP](../dsr1-wideep.md) on a SLURM cluster. +This folder implements the example of [SGLang DeepSeek-R1 Disaggregated with WideEP](../docs/dsr1-wideep-h100.md) on a SLURM cluster. ## Overview -The scripts in this folder set up multiple cluster nodes to run the [SGLang DeepSeek-R1 Disaggregated with WideEP](../dsr1-wideep.md) example, with separate nodes handling prefill and decode. +The scripts in this folder set up multiple cluster nodes to run the [SGLang DeepSeek-R1 Disaggregated with WideEP](../docs/dsr1-wideep-h100.md) example, with separate nodes handling prefill and decode. The node setup is done using Python job submission scripts with Jinja2 templates for flexible configuration. 
The setup also includes GPU utilization monitoring capabilities to track performance during benchmarks. ## Scripts @@ -56,7 +56,7 @@ For simplicity of the example, we will make some assumptions about your SLURM cl If your cluster supports similar container based plugins, you may be able to modify the template to use that instead. 3. We assume you have already built a recent Dynamo+SGLang container image as - described [here](../dsr1-wideep.md#instructions). + described [here](../docs/dsr1-wideep-h100.md#instructions). This is the image that can be passed to the `--container-image` argument in later steps. ## Usage diff --git a/components/backends/sglang/slurm_jobs/job_script_template.j2 b/components/backends/sglang/slurm_jobs/job_script_template.j2 index 84e0e33396..bbabbe8152 100755 --- a/components/backends/sglang/slurm_jobs/job_script_template.j2 +++ b/components/backends/sglang/slurm_jobs/job_script_template.j2 @@ -54,8 +54,7 @@ echo "Decode host IP address: $DECODE_HOST_IP" ENROOT_ARGS="\ --container-image=${CONTAINER_IMAGE} \ --no-container-entrypoint \ - --container-mount-home \ - --no-container-remap-root \ + --no-container-mount-home \ --container-mounts=${MODEL_DIR}:/model/,${CONFIG_DIR}:/configs/,${SCRIPT_DIR}:/scripts/,${OUTPUT_DIR}:/outputs/,${LOG_DIR}:/logs/ \ " diff --git a/components/backends/sglang/slurm_jobs/scripts/worker_setup.py b/components/backends/sglang/slurm_jobs/scripts/worker_setup.py index db6ac88531..08df3bfb67 100644 --- a/components/backends/sglang/slurm_jobs/scripts/worker_setup.py +++ b/components/backends/sglang/slurm_jobs/scripts/worker_setup.py @@ -206,7 +206,9 @@ def setup_prefill_node( if not etcd_process: raise RuntimeError("Failed to start etcd") - ingress_process = run_command("dynamo run in=http out=dyn", background=True) + ingress_process = run_command( + "python3 -m dynamo.frontend --http-port=8000 &", background=True + ) if not ingress_process: raise RuntimeError("Failed to start ingress") @@ -291,7 +293,7 @@ def setup_decode_node( "--deepep-mode low_latency " "--mem-fraction-static 0.835 " "--ep-num-redundant-experts 32 " - "--cuda-graph-bs 256 " + "--cuda-graph-bs 128 " ) return run_command(dynamo_cmd) diff --git a/components/backends/trtllm/README.md b/components/backends/trtllm/README.md index 3a5b495dce..67d63bd8d5 100644 --- a/components/backends/trtllm/README.md +++ b/components/backends/trtllm/README.md @@ -49,12 +49,12 @@ git checkout $(git describe --tags $(git rev-list --tags --max-count=1)) | Feature | TensorRT-LLM | Notes | |---------|--------------|-------| -| [**Disaggregated Serving**](../../docs/architecture/disagg_serving.md) | ✅ | | -| [**Conditional Disaggregation**](../../docs/architecture/disagg_serving.md#conditional-disaggregation) | 🚧 | Not supported yet | -| [**KV-Aware Routing**](../../docs/architecture/kv_cache_routing.md) | ✅ | | -| [**SLA-Based Planner**](../../docs/architecture/sla_planner.md) | 🚧 | Planned | -| [**Load Based Planner**](../../docs/architecture/load_planner.md) | 🚧 | Planned | -| [**KVBM**](../../docs/architecture/kvbm_architecture.md) | 🚧 | Planned | +| [**Disaggregated Serving**](../../../docs/architecture/disagg_serving.md) | ✅ | | +| [**Conditional Disaggregation**](../../../docs/architecture/disagg_serving.md#conditional-disaggregation) | 🚧 | Not supported yet | +| [**KV-Aware Routing**](../../../docs/architecture/kv_cache_routing.md) | ✅ | | +| [**SLA-Based Planner**](../../../docs/architecture/sla_planner.md) | 🚧 | Planned | +| [**Load Based 
Planner**](../../../docs/architecture/load_planner.md) | 🚧 | Planned | +| [**KVBM**](../../../docs/architecture/kvbm_architecture.md) | 🚧 | Planned | ### Large Scale P/D and WideEP Features @@ -185,6 +185,22 @@ For comprehensive instructions on multinode serving, see the [multinode-examples ### Speculative Decoding - **[Llama 4 Maverick Instruct + Eagle Speculative Decoding](./llama4_plus_eagle.md)** +### Kubernetes Deployment + +For complete Kubernetes deployment instructions, configurations, and troubleshooting, see [TensorRT-LLM Kubernetes Deployment Guide](deploy/README.md) + +### Client + +See [client](../llm/README.md#client) section to learn how to send request to the deployment. + +NOTE: To send a request to a multi-node deployment, target the node which is running `python3 -m dynamo.frontend `. + +### Benchmarking + +To benchmark your deployment with GenAI-Perf, see this utility script, configuring the +`model` name and `host` based on your deployment: [perf.sh](../../../benchmarks/llm/perf.sh) + + ## Disaggregation Strategy The disaggregation strategy controls how requests are distributed between the prefill and decode workers in a disaggregated deployment. @@ -204,7 +220,7 @@ Dynamo with TensorRT-LLM supports two methods for transferring KV cache in disag ## Request Migration -In a [Distributed System](#distributed-system), a request may fail due to connectivity issues between the Frontend and the Backend. +In a Distributed System, a request may fail due to connectivity issues between the Frontend and the Backend. The Frontend will automatically track which Backends are having connectivity issues with it and avoid routing new requests to the Backends with known connectivity issues. @@ -220,7 +236,7 @@ The migrated request will continue responding to the original request, allowing ## Client -See [client](../llm/README.md#client) section to learn how to send request to the deployment. +See the [quickstart guide](../../../examples/basics/quickstart/README.md#3-send-requests) to learn how to send request to the deployment. NOTE: To send a request to a multi-node deployment, target the node which is running `python3 -m dynamo.frontend `. diff --git a/components/backends/trtllm/deploy/README.md b/components/backends/trtllm/deploy/README.md new file mode 100644 index 0000000000..9add8791da --- /dev/null +++ b/components/backends/trtllm/deploy/README.md @@ -0,0 +1,288 @@ +# TensorRT-LLM Kubernetes Deployment Configurations + +This directory contains Kubernetes Custom Resource Definition (CRD) templates for deploying TensorRT-LLM inference graphs using the **DynamoGraphDeployment** resource. + +## Available Deployment Patterns + +### 1. **Aggregated Deployment** (`agg.yaml`) +Basic deployment pattern with frontend and a single worker. + +**Architecture:** +- `Frontend`: OpenAI-compatible API server (with kv router mode disabled) +- `TRTLLMWorker`: Single worker handling both prefill and decode + +### 2. **Aggregated Router Deployment** (`agg_router.yaml`) +Enhanced aggregated deployment with KV cache routing capabilities. + +**Architecture:** +- `Frontend`: OpenAI-compatible API server (with kv router mode enabled) +- `TRTLLMWorker`: Multiple workers handling both prefill and decode (2 replicas for load balancing) + +### 3. **Disaggregated Deployment** (`disagg.yaml`) +High-performance deployment with separated prefill and decode workers. 
+ +**Architecture:** +- `Frontend`: HTTP API server coordinating between workers +- `TRTLLMDecodeWorker`: Specialized decode-only worker +- `TRTLLMPrefillWorker`: Specialized prefill-only worker + +### 4. **Disaggregated Router Deployment** (`disagg_router.yaml`) +Advanced disaggregated deployment with KV cache routing capabilities. + +**Architecture:** +- `Frontend`: HTTP API server (with kv router mode enabled) +- `TRTLLMDecodeWorker`: Specialized decode-only worker +- `TRTLLMPrefillWorker`: Specialized prefill-only worker (2 replicas for load balancing) + +## CRD Structure + +All templates use the **DynamoGraphDeployment** CRD: + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: +spec: + services: + : + # Service configuration +``` + +### Key Configuration Options + +**Resource Management:** +```yaml +resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" +``` + +**Container Configuration:** +```yaml +extraPodSpec: + mainContainer: + image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17 + workingDir: /workspace/components/backends/trtllm + args: + - "python3" + - "-m" + - "dynamo.trtllm" + # Model-specific arguments +``` + +## Prerequisites + +Before using these templates, ensure you have: + +1. **Dynamo Cloud Platform installed** - See [Quickstart Guide](../../../../docs/guides/dynamo_deploy/quickstart.md) +2. **Kubernetes cluster with GPU support** +3. **Container registry access** for TensorRT-LLM runtime images +4. **HuggingFace token secret** (referenced as `envFromSecret: hf-token-secret`) + +### Container Images + +The deployment files currently require access to `nvcr.io/nvidian/nim-llm-dev/trtllm-runtime`. If you don't have access, build and push your own image: + +```bash +./container/build.sh --framework tensorrtllm +# Tag and push to your container registry +# Update the image references in the YAML files +``` + +**Note:** TensorRT-LLM uses git-lfs, which needs to be installed in advance: +```bash +apt-get update && apt-get -y install git git-lfs +``` + +For ARM machines, use: +```bash +./container/build.sh --framework tensorrtllm --platform linux/arm64 +``` + +## Usage + +### 1. Choose Your Template +Select the deployment pattern that matches your requirements: +- Use `agg.yaml` for simple testing +- Use `agg_router.yaml` for production with KV cache routing and load balancing +- Use `disagg.yaml` for maximum performance with separated workers +- Use `disagg_router.yaml` for high-performance with KV cache routing and disaggregation + +### 2. Customize Configuration +Edit the template to match your environment: + +```yaml +# Update image registry and tag +image: your-registry/trtllm-runtime:your-tag + +# Configure your model and deployment settings +args: + - "python3" + - "-m" + - "dynamo.trtllm" + # Add your model-specific arguments +``` + +### 3. Deploy + +See the [Create Deployment Guide](../../../../docs/guides/dynamo_deploy/create_deployment.md) to learn how to deploy the deployment file. + +First, create a secret for the HuggingFace token. +```bash +export HF_TOKEN=your_hf_token +kubectl create secret generic hf-token-secret \ + --from-literal=HF_TOKEN=${HF_TOKEN} \ + -n ${NAMESPACE} +``` + +Then, deploy the model using the deployment file. + +Export the NAMESPACE you used in your Dynamo Cloud Installation. + +```bash +cd dynamo/components/backends/trtllm/deploy +export DEPLOYMENT_FILE=agg.yaml +kubectl apply -f $DEPLOYMENT_FILE -n $NAMESPACE +``` + +### 4. 
Using Custom Dynamo Frameworks Image for TensorRT-LLM + +To use a custom dynamo frameworks image for TensorRT-LLM, you can update the deployment file using yq: + +```bash +export DEPLOYMENT_FILE=agg.yaml +export FRAMEWORK_RUNTIME_IMAGE= + +yq '.spec.services.[].extraPodSpec.mainContainer.image = env(FRAMEWORK_RUNTIME_IMAGE)' $DEPLOYMENT_FILE > $DEPLOYMENT_FILE.generated +kubectl apply -f $DEPLOYMENT_FILE.generated -n $NAMESPACE +``` + +### 5. Port Forwarding + +After deployment, forward the frontend service to access the API: + +```bash +kubectl port-forward deployment/trtllm-v1-disagg-frontend- 8000:8000 +``` + +## Configuration Options + +### Environment Variables + +To change `DYN_LOG` level, edit the yaml file by adding: + +```yaml +... +spec: + envs: + - name: DYN_LOG + value: "debug" # or other log levels + ... +``` + +### TensorRT-LLM Worker Configuration + +TensorRT-LLM workers are configured through command-line arguments in the deployment YAML. Key configuration areas include: + +- **Disaggregation Strategy**: Control request flow with `DISAGGREGATION_STRATEGY` environment variable +- **KV Cache Transfer**: Choose between UCX (default) or NIXL for disaggregated serving +- **Request Migration**: Enable graceful failure handling with `--migration-limit` + +### Disaggregation Strategy + +The disaggregation strategy controls how requests are distributed between prefill and decode workers: + +- **`decode_first`** (default): Requests routed to decode worker first, then forwarded to prefill worker +- **`prefill_first`**: Requests routed directly to prefill worker (used with KV routing) + +Set via environment variable: +```yaml +envs: + - name: DISAGGREGATION_STRATEGY + value: "prefill_first" +``` + +## Testing the Deployment + +Send a test request to verify your deployment. See the [client section](../../../../components/backends/llm/README.md#client) for detailed instructions. + +**Note:** For multi-node deployments, target the node running `python3 -m dynamo.frontend `. + +## Model Configuration + +The deployment templates support various TensorRT-LLM models and configurations. You can customize model-specific arguments in the worker configuration sections of the YAML files. + +### Multi-Token Prediction (MTP) Support + +For models supporting Multi-Token Prediction (such as DeepSeek R1), special configuration is available. Note that MTP requires the experimental TensorRT-LLM commit: + +```bash +./container/build.sh --framework tensorrtllm --use-default-experimental-tensorrtllm-commit +``` + +## Monitoring and Health + +- **Frontend health endpoint**: `http://:8000/health` +- **Worker health endpoints**: `http://:9090/health` +- **Liveness probes**: Check process health every 5 seconds +- **Readiness probes**: Check service readiness with configurable delays + +## KV Cache Transfer Methods + +TensorRT-LLM supports two methods for KV cache transfer in disaggregated serving: + +- **UCX** (default): Standard method for KV cache transfer +- **NIXL** (experimental): Alternative transfer method + +For detailed configuration instructions, see the [KV cache transfer guide](../kv-cache-tranfer.md). 
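Editor's note: the port-forwarding and health-endpoint notes above can be combined into a quick smoke test. This is a sketch only; the frontend deployment name below is hypothetical and should be replaced with whatever your DynamoGraphDeployment actually created (check `kubectl get deploy -n $NAMESPACE`).

```bash
# Hypothetical deployment name; substitute the frontend deployment created in your namespace.
kubectl port-forward deployment/trtllm-agg-frontend 8000:8000 -n $NAMESPACE &

# The frontend reports health on /health, the same endpoint the probes in these YAMLs poll.
curl -s http://localhost:8000/health | jq .

# Send a small completion request to the default model from the aggregated template.
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", "messages": [{"role": "user", "content": "ping"}], "max_tokens": 8}'
```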
+ +## Request Migration + +You can enable [request migration](../../../../docs/architecture/request_migration.md) to handle worker failures gracefully by adding the migration limit argument to worker configurations: + +```yaml +args: + - "python3" + - "-m" + - "dynamo.trtllm" + - "--migration-limit" + - "3" +``` + +## Benchmarking + +To benchmark your deployment with GenAI-Perf, see this utility script: [perf.sh](../../../../benchmarks/llm/perf.sh) + +Configure the `model` name and `host` based on your deployment. + +## Further Reading + +- **Deployment Guide**: [Creating Kubernetes Deployments](../../../../docs/guides/dynamo_deploy/create_deployment.md) +- **Quickstart**: [Deployment Quickstart](../../../../docs/guides/dynamo_deploy/quickstart.md) +- **Platform Setup**: [Dynamo Cloud Installation](../../../../docs/guides/dynamo_deploy/dynamo_cloud.md) +- **Examples**: [Deployment Examples](../../../../docs/examples/README.md) +- **Architecture Docs**: [Disaggregated Serving](../../../../docs/architecture/disagg_serving.md), [KV-Aware Routing](../../../../docs/architecture/kv_cache_routing.md) +- **Multinode Deployment**: [Multinode Examples](../multinode/multinode-examples.md) +- **Speculative Decoding**: [Llama 4 + Eagle Guide](../llama4_plus_eagle.md) +- **Kubernetes CRDs**: [Custom Resources Documentation](https://kubernetes.io/docs/concepts/extend-kubernetes/api-extension/custom-resources/) + +## Troubleshooting + +Common issues and solutions: + +1. **Pod fails to start**: Check image registry access and HuggingFace token secret +2. **GPU not allocated**: Verify cluster has GPU nodes and proper resource limits +3. **Health check failures**: Review model loading logs and increase `initialDelaySeconds` +4. **Out of memory**: Increase memory limits or reduce model batch size +5. **Port forwarding issues**: Ensure correct pod UUID in port-forward command +6. **Git LFS issues**: Ensure git-lfs is installed before building containers +7. **ARM deployment**: Use `--platform linux/arm64` when building on ARM machines + +For additional support, refer to the [deployment troubleshooting guide](../../../../docs/guides/dynamo_deploy/quickstart.md#troubleshooting). diff --git a/components/backends/trtllm/deploy/agg.yaml b/components/backends/trtllm/deploy/agg.yaml new file mode 100644 index 0000000000..3fe9ad54ac --- /dev/null +++ b/components/backends/trtllm/deploy/agg.yaml @@ -0,0 +1,104 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: trtllm-agg +spec: + services: + Frontend: + dynamoNamespace: trtllm-agg + componentType: main + livenessProbe: + exec: + command: + - /bin/sh + - -c + - 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""' + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + readinessProbe: + exec: + command: + - /bin/sh + - -c + - 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""' + initialDelaySeconds: 60 + periodSeconds: 60 + timeoutSeconds: 3 + failureThreshold: 10 + replicas: 1 + resources: + requests: + cpu: "5" + memory: "10Gi" + limits: + cpu: "5" + memory: "10Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17 + workingDir: /workspace/components/backends/trtllm + command: + - /bin/sh + - -c + args: + - "python3 -m dynamo.frontend --http-port 8000" + TRTLLMWorker: + envFromSecret: hf-token-secret + livenessProbe: + httpGet: + path: /live + port: 9090 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /health + port: 9090 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 60 + dynamoNamespace: trtllm-agg + componentType: worker + replicas: 1 + resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" + envs: + - name: DYN_SYSTEM_ENABLED + value: "true" + - name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS + value: "[\"generate\"]" + - name: DYN_SYSTEM_PORT + value: "9090" + extraPodSpec: + mainContainer: + startupProbe: + httpGet: + path: /health + port: 9090 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 60 + image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17 + workingDir: /workspace/components/backends/trtllm + args: + - "python3" + - "-m" + - "dynamo.trtllm" + - "--model-path" + - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" + - "--served-model-name" + - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" + - "--extra-engine-args" + - "engine_configs/agg.yaml" diff --git a/components/backends/trtllm/deploy/agg_router.yaml b/components/backends/trtllm/deploy/agg_router.yaml new file mode 100644 index 0000000000..116693d90a --- /dev/null +++ b/components/backends/trtllm/deploy/agg_router.yaml @@ -0,0 +1,105 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: trtllm-agg-router +spec: + services: + Frontend: + livenessProbe: + exec: + command: + - /bin/sh + - -c + - 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""' + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + readinessProbe: + exec: + command: + - /bin/sh + - -c + - 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""' + initialDelaySeconds: 60 + periodSeconds: 60 + timeoutSeconds: 3 + failureThreshold: 5 + dynamoNamespace: trtllm-agg-router + componentType: main + replicas: 1 + resources: + requests: + cpu: "1" + memory: "2Gi" + limits: + cpu: "1" + memory: "2Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17 + workingDir: /workspace/components/backends/trtllm + command: + - /bin/sh + - -c + args: + - "python3 -m dynamo.frontend --http-port 8000 --router-mode kv" + TRTLLMWorker: + envFromSecret: hf-token-secret + livenessProbe: + httpGet: + path: /live + port: 9090 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /health + port: 9090 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 60 + dynamoNamespace: trtllm-agg-router + componentType: worker + replicas: 2 + resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" + envs: + - name: DYN_SYSTEM_ENABLED + value: "true" + - name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS + value: "[\"generate\"]" + - name: DYN_SYSTEM_PORT + value: "9090" + extraPodSpec: + mainContainer: + startupProbe: + httpGet: + path: /health + port: 9090 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 60 + image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17 + workingDir: /workspace/components/backends/trtllm + args: + - "python3" + - "-m" + - "dynamo.trtllm" + - "--model-path" + - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" + - "--served-model-name" + - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" + - "--extra-engine-args" + - "engine_configs/agg.yaml" + - "--publish-events-and-metrics" diff --git a/components/backends/trtllm/deploy/disagg.yaml b/components/backends/trtllm/deploy/disagg.yaml new file mode 100644 index 0000000000..be2eefcd51 --- /dev/null +++ b/components/backends/trtllm/deploy/disagg.yaml @@ -0,0 +1,150 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: trtllm-disagg +spec: + services: + Frontend: + dynamoNamespace: trtllm-disagg + componentType: main + livenessProbe: + exec: + command: + - /bin/sh + - -c + - 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""' + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + readinessProbe: + exec: + command: + - /bin/sh + - -c + - 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""' + initialDelaySeconds: 60 + periodSeconds: 60 + timeoutSeconds: 3 + failureThreshold: 10 + replicas: 1 + resources: + requests: + cpu: "5" + memory: "10Gi" + limits: + cpu: "5" + memory: "10Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17 + workingDir: /workspace/components/backends/trtllm + command: + - /bin/sh + - -c + args: + - "python3 -m dynamo.frontend --http-port 8000" + TRTLLMPrefillWorker: + dynamoNamespace: trtllm-disagg + envFromSecret: hf-token-secret + componentType: worker + replicas: 1 + livenessProbe: + httpGet: + path: /live + port: 9090 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /health + port: 9090 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 60 + resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" + extraPodSpec: + mainContainer: + startupProbe: + httpGet: + path: /health + port: 9090 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 60 + image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17 + workingDir: /workspace/components/backends/trtllm + command: + - /bin/sh + - -c + args: + - "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/prefill.yaml --disaggregation-mode prefill --disaggregation-strategy decode_first 2>&1 | tee /tmp/trtllm.log" + envs: + - name: DYN_SYSTEM_ENABLED + value: "true" + - name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS + value: "[\"generate\"]" + - name: DYN_SYSTEM_PORT + value: "9090" + TRTLLMDecodeWorker: + dynamoNamespace: trtllm-disagg + envFromSecret: hf-token-secret + componentType: worker + replicas: 1 + livenessProbe: + httpGet: + path: /live + port: 9090 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /health + port: 9090 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 60 + resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" + extraPodSpec: + mainContainer: + startupProbe: + httpGet: + path: /health + port: 9090 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 60 + image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17 + workingDir: /workspace/components/backends/trtllm + command: + - /bin/sh + - -c + args: + - "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/decode.yaml --disaggregation-mode decode --disaggregation-strategy decode_first 2>&1 | tee /tmp/trtllm.log" + envs: + - name: DYN_SYSTEM_ENABLED + value: "true" + - name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS + value: "[\"generate\"]" + - name: DYN_SYSTEM_PORT + value: "9090" diff --git a/components/backends/trtllm/deploy/disagg_router.yaml 
b/components/backends/trtllm/deploy/disagg_router.yaml new file mode 100644 index 0000000000..512138cbbf --- /dev/null +++ b/components/backends/trtllm/deploy/disagg_router.yaml @@ -0,0 +1,150 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: trtllm-v1-disagg-router +spec: + services: + Frontend: + dynamoNamespace: trtllm-v1-disagg-router + componentType: main + livenessProbe: + exec: + command: + - /bin/sh + - -c + - 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""' + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + readinessProbe: + exec: + command: + - /bin/sh + - -c + - 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""' + initialDelaySeconds: 60 + periodSeconds: 60 + timeoutSeconds: 3 + failureThreshold: 10 + replicas: 1 + resources: + requests: + cpu: "5" + memory: "10Gi" + limits: + cpu: "5" + memory: "10Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17 + workingDir: /workspace/components/backends/trtllm + command: + - /bin/sh + - -c + args: + - "python3 -m dynamo.frontend --http-port 8000 --router-mode kv" + TRTLLMPrefillWorker: + dynamoNamespace: trtllm-v1-disagg-router + envFromSecret: hf-token-secret + componentType: worker + replicas: 2 + livenessProbe: + httpGet: + path: /live + port: 9090 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /health + port: 9090 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 60 + resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" + extraPodSpec: + mainContainer: + startupProbe: + httpGet: + path: /health + port: 9090 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 60 + image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17 + workingDir: /workspace/components/backends/trtllm + command: + - /bin/sh + - -c + args: + - "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/prefill.yaml --disaggregation-mode prefill --disaggregation-strategy prefill_first --publish-events-and-metrics 2>&1 | tee /tmp/trtllm.log" + envs: + - name: DYN_SYSTEM_ENABLED + value: "true" + - name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS + value: "[\"generate\"]" + - name: DYN_SYSTEM_PORT + value: "9090" + TRTLLMDecodeWorker: + dynamoNamespace: trtllm-v1-disagg-router + envFromSecret: hf-token-secret + componentType: worker + replicas: 1 + livenessProbe: + httpGet: + path: /live + port: 9090 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /health + port: 9090 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 60 + resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" + extraPodSpec: + mainContainer: + startupProbe: + httpGet: + path: /health + port: 9090 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 60 + image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17 + workingDir: /workspace/components/backends/trtllm + command: + - /bin/sh + - -c + args: + - "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args 
engine_configs/decode.yaml --disaggregation-mode decode --disaggregation-strategy prefill_first 2>&1 | tee /tmp/trtllm.log" + envs: + - name: DYN_SYSTEM_ENABLED + value: "true" + - name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS + value: "[\"generate\"]" + - name: DYN_SYSTEM_PORT + value: "9090" diff --git a/components/backends/trtllm/engine_configs/agg.yaml b/components/backends/trtllm/engine_configs/agg.yaml index 02b5cd8463..d349a65756 100644 --- a/components/backends/trtllm/engine_configs/agg.yaml +++ b/components/backends/trtllm/engine_configs/agg.yaml @@ -28,4 +28,7 @@ kv_cache_config: # NOTE: overlap_scheduler enabled by default since this commit and changed # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler': # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 -use_cuda_graph: true + + +cuda_graph_config: + max_batch_size: 16 \ No newline at end of file diff --git a/components/backends/trtllm/engine_configs/decode.yaml b/components/backends/trtllm/engine_configs/decode.yaml index 3460f6ff80..bafc26d450 100644 --- a/components/backends/trtllm/engine_configs/decode.yaml +++ b/components/backends/trtllm/engine_configs/decode.yaml @@ -16,11 +16,16 @@ tensor_parallel_size: 1 moe_expert_parallel_size: 1 enable_attention_dp: false max_num_tokens: 8192 -max_batch_size: 16 trust_remote_code: true backend: pytorch enable_chunked_prefill: true disable_overlap_scheduler: false -use_cuda_graph: true + +cuda_graph_config: + max_batch_size: 16 + kv_cache_config: free_gpu_memory_fraction: 0.95 + +cache_transceiver_config: + backend: default diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_agg.yaml b/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_agg.yaml index f0b5411221..25fae60abf 100644 --- a/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_agg.yaml +++ b/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_agg.yaml @@ -28,23 +28,24 @@ max_num_tokens: 8448 max_seq_len: 8448 kv_cache_config: free_gpu_memory_fraction: 0.30 + dtype: fp8 # Enable the MTP(Multi-Token Prediction) in the model engine speculative_config: decoding_type: MTP num_nextn_predict_layers: 1 -use_cuda_graph: true -cuda_graph_padding_enabled: true -cuda_graph_batch_sizes: -- 1 -- 2 -- 4 -- 8 -- 16 -- 32 -- 64 -- 128 -- 256 +cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + print_iter_log: true -kv_cache_dtype: fp8 diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_decode.yaml b/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_decode.yaml index ab48b2e78b..59b9aabe98 100644 --- a/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_decode.yaml +++ b/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_decode.yaml @@ -31,23 +31,27 @@ max_num_tokens: 512 max_seq_len: 8704 kv_cache_config: free_gpu_memory_fraction: 0.85 + dtype: fp8 # Enable the MTP(Multi-Token Prediction) in decode model engine speculative_config: decoding_type: MTP num_nextn_predict_layers: 1 -use_cuda_graph: true -cuda_graph_padding_enabled: true -cuda_graph_batch_sizes: -- 1 -- 2 -- 4 -- 8 -- 16 -- 32 -- 64 -- 128 -- 256 +cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + print_iter_log: true -kv_cache_dtype: fp8 + +cache_transceiver_config: + backend: default diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_prefill.yaml 
b/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_prefill.yaml index ee6ee26a94..f44bcac141 100644 --- a/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_prefill.yaml +++ b/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_prefill.yaml @@ -27,11 +27,15 @@ max_num_tokens: 8192 max_seq_len: 8192 kv_cache_config: free_gpu_memory_fraction: 0.75 + dtype: fp8 + print_iter_log: true -kv_cache_dtype: fp8 disable_overlap_scheduler: true # Enable the MTP(Multi-Token Prediction) in the prefill model engine speculative_config: decoding_type: MTP num_nextn_predict_layers: 1 + +cache_transceiver_config: + backend: default diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/simple/agg.yaml b/components/backends/trtllm/engine_configs/deepseek_r1/simple/agg.yaml index 29dddba56f..db2377a92a 100644 --- a/components/backends/trtllm/engine_configs/deepseek_r1/simple/agg.yaml +++ b/components/backends/trtllm/engine_configs/deepseek_r1/simple/agg.yaml @@ -31,24 +31,26 @@ kv_cache_config: # With dp attention enabled: large ISL at high concurrency may need # free_gpu_memory_fraction low to have enough available memory. # free_gpu_memory_fraction: 0.30 + dtype: fp8 + # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603 # NOTE: overlap_scheduler enabled by default since this commit and changed # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler': # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 -use_cuda_graph: true -cuda_graph_padding_enabled: true +cuda_graph_config: + enable_padding: true # NOTE: For larger max batch size, you may want to add larger cuda graph # batch sizes below to match. -cuda_graph_batch_sizes: -- 1 -- 2 -- 4 -- 8 -- 16 -- 32 -- 64 -- 128 -- 256 + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + print_iter_log: true -kv_cache_dtype: fp8 diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/simple/decode.yaml b/components/backends/trtllm/engine_configs/deepseek_r1/simple/decode.yaml index 772b94b283..73e193c146 100644 --- a/components/backends/trtllm/engine_configs/deepseek_r1/simple/decode.yaml +++ b/components/backends/trtllm/engine_configs/deepseek_r1/simple/decode.yaml @@ -31,25 +31,30 @@ kv_cache_config: # With dp attention enabled: large ISL at high concurrency may need # free_gpu_memory_fraction low to have enough available memory. # free_gpu_memory_fraction: 0.30 + dtype: fp8 # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603 # NOTE: overlap_scheduler enabled by default since this commit and changed # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler': # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 disable_overlap_scheduler: false -use_cuda_graph: true -cuda_graph_padding_enabled: true -# NOTE: For larger max batch size, you may want to add larger cuda graph -# batch sizes below to match. -cuda_graph_batch_sizes: -- 1 -- 2 -- 4 -- 8 -- 16 -- 32 -- 64 -- 128 -- 256 + +cuda_graph_config: + enable_padding: true + # NOTE: For larger max batch size, you may want to + # add larger cuda graph batch sizes below to match. 
+ batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + print_iter_log: true -kv_cache_dtype: fp8 + +cache_transceiver_config: + backend: default diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/simple/prefill.yaml b/components/backends/trtllm/engine_configs/deepseek_r1/simple/prefill.yaml index 6ae899a68a..3d6d4d3574 100644 --- a/components/backends/trtllm/engine_configs/deepseek_r1/simple/prefill.yaml +++ b/components/backends/trtllm/engine_configs/deepseek_r1/simple/prefill.yaml @@ -26,6 +26,7 @@ max_seq_len: 8192 kv_cache_config: free_gpu_memory_fraction: 0.75 + dtype: fp8 # NOTE: This dtype must match in both prefill/decode configs # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603 # NOTE: overlap_scheduler enabled by default since this commit and changed @@ -33,5 +34,6 @@ kv_cache_config: # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 disable_overlap_scheduler: true print_iter_log: true -# NOTE: This dtype must match in both prefill/decode configs -kv_cache_dtype: fp8 + +cache_transceiver_config: + backend: default diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/dep16_agg.yaml b/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/dep16_agg.yaml index d697caacfa..844c4ffa72 100644 --- a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/dep16_agg.yaml +++ b/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/dep16_agg.yaml @@ -10,18 +10,20 @@ enable_attention_dp: true max_batch_size: 256 max_num_tokens: 256 max_seq_len: 8448 + kv_cache_config: free_gpu_memory_fraction: 0.7 -use_cuda_graph: true -cuda_graph_padding_enabled: true -cuda_graph_batch_sizes: -- 1 -- 2 -- 4 -- 8 -- 16 -- 32 -- 64 -- 128 -- 256 -kv_cache_dtype: fp8 + dtype: fp8 + +cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml b/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml index 4f2df0aa56..d32aab2dd3 100644 --- a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml +++ b/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml @@ -3,14 +3,16 @@ backend: pytorch # WideEP related settings -moe_backend: WideEP -# moe_max_num_tokens will default to max_num_tokens if left unspecified. -# -# If you want to set this value explicitly, one recommendation is below: -# moe_max_num_tokens = max_batch_size * moe_expert_parallel_size -# 4096 = 256 * 16 -# moe_max_num_tokens: 4096 -moe_load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml +moe_config: + backend: WIDEEP + # moe_max_num_tokens will default to max_num_tokens if left unspecified. 
+ # + # If you want to set this value explicitly, one recommendation is below: + # moe_max_num_tokens = max_batch_size * moe_expert_parallel_size + # 4096 = 256 * 16 + # moe_max_num_tokens: 4096 + load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml + tensor_parallel_size: 16 moe_expert_parallel_size: 16 @@ -18,18 +20,20 @@ enable_attention_dp: true max_batch_size: 256 max_num_tokens: 256 max_seq_len: 8448 + kv_cache_config: - free_gpu_memory_fraction: 0.7 -use_cuda_graph: true -cuda_graph_padding_enabled: true -cuda_graph_batch_sizes: -- 1 -- 2 -- 4 -- 8 -- 16 -- 32 -- 64 -- 128 -- 256 -kv_cache_dtype: fp8 + free_gpu_memory_fraction: 0.3 + dtype: fp8 + +cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 \ No newline at end of file diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml b/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml index a8d1854814..652cf82250 100644 --- a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml +++ b/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml @@ -15,8 +15,9 @@ backend: pytorch # WideEP related settings -moe_backend: WideEP -moe_load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml +moe_config: + backend: WIDEEP + load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml # TP/EP/PP/DP tensor_parallel_size: 16 @@ -35,25 +36,31 @@ kv_cache_config: # With dp attention enabled: large ISL at high concurrency may need # free_gpu_memory_fraction low to have enough available memory. free_gpu_memory_fraction: 0.30 + dtype: fp8 + # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603 # NOTE: overlap_scheduler enabled by default since this commit and changed # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler': # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 disable_overlap_scheduler: false -use_cuda_graph: true -cuda_graph_padding_enabled: true -# NOTE: For larger max batch size, you may want to add larger cuda graph -# batch sizes below to match. -cuda_graph_batch_sizes: -- 1 -- 2 -- 4 -- 8 -- 16 -- 32 -- 64 -- 128 -- 256 +cuda_graph_config: + enable_padding: true + # NOTE: For larger max batch size, you may want to + # add larger cuda graph batch sizes below to match. 
+ batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + + print_iter_log: true -kv_cache_dtype: fp8 + +cache_transceiver_config: + backend: default diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml b/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml index 44e439e506..4f7aabe682 100644 --- a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml +++ b/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml @@ -15,8 +15,9 @@ backend: pytorch # WideEP related settings -moe_backend: WideEP -moe_load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml +moe_config: + backend: WIDEEP + load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml # TP/EP/PP/DP tensor_parallel_size: 16 @@ -29,7 +30,8 @@ max_num_tokens: 8192 max_seq_len: 8192 kv_cache_config: - free_gpu_memory_fraction: 0.75 + free_gpu_memory_fraction: 0.3 + dtype: fp8 # NOTE: This dtype must match in both prefill/decode configs # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603 # NOTE: overlap_scheduler enabled by default since this commit and changed @@ -37,5 +39,6 @@ kv_cache_config: # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 disable_overlap_scheduler: true print_iter_log: true -# NOTE: This dtype must match in both prefill/decode configs -kv_cache_dtype: fp8 + +cache_transceiver_config: + backend: default diff --git a/components/backends/trtllm/engine_configs/llama4/eagle/eagle_agg.yaml b/components/backends/trtllm/engine_configs/llama4/eagle/eagle_agg.yaml index 1bed25ef27..297a01595e 100644 --- a/components/backends/trtllm/engine_configs/llama4/eagle/eagle_agg.yaml +++ b/components/backends/trtllm/engine_configs/llama4/eagle/eagle_agg.yaml @@ -21,31 +21,21 @@ max_batch_size: 256 # Will be investigated in the future with TRTLLM team. 
max_num_tokens: 1024 max_seq_len: 8448 -autotuner_enabled: false +enable_autotuner: false disable_overlap_scheduler: true # Enable Speculative Decoding in the model engine speculative_config: decoding_type: Eagle max_draft_len: 1 - pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3 - eagle3_one_model: False + speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3 + eagle3_one_model: false kv_cache_config: free_gpu_memory_fraction: 0.5 enable_block_reuse: false -use_cuda_graph: true -cuda_graph_padding_enabled: true -cuda_graph_batch_sizes: -- 1 -- 2 -- 4 -- 8 -- 16 -- 32 -- 64 -- 128 -- 256 -print_iter_log: true -kv_cache_dtype: fp8 + +cuda_graph_config: + max_batch_size: 8 + diff --git a/components/backends/trtllm/engine_configs/llama4/eagle/eagle_decode.yaml b/components/backends/trtllm/engine_configs/llama4/eagle/eagle_decode.yaml index 4b595d2126..0b8d799bfb 100644 --- a/components/backends/trtllm/engine_configs/llama4/eagle/eagle_decode.yaml +++ b/components/backends/trtllm/engine_configs/llama4/eagle/eagle_decode.yaml @@ -21,30 +21,34 @@ max_num_tokens: 512 # 8704 = 8192 ISL + 512 OSL max_seq_len: 8704 disable_overlap_scheduler: true -autotuner_enabled: false +enable_autotuner: false # Enable Speculative Decoding in the model engine speculative_config: decoding_type: Eagle max_draft_len: 1 - pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3 - eagle3_one_model: False + speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3 + eagle3_one_model: false kv_cache_config: free_gpu_memory_fraction: 0.5 enable_block_reuse: false + dtype: fp8 + +cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 -use_cuda_graph: true -cuda_graph_padding_enabled: true -cuda_graph_batch_sizes: -- 1 -- 2 -- 4 -- 8 -- 16 -- 32 -- 64 -- 128 -- 256 print_iter_log: true -kv_cache_dtype: fp8 + +cache_transceiver_config: + backend: default diff --git a/components/backends/trtllm/engine_configs/llama4/eagle/eagle_prefill.yaml b/components/backends/trtllm/engine_configs/llama4/eagle/eagle_prefill.yaml index 8442e478ba..b05181b226 100644 --- a/components/backends/trtllm/engine_configs/llama4/eagle/eagle_prefill.yaml +++ b/components/backends/trtllm/engine_configs/llama4/eagle/eagle_prefill.yaml @@ -20,17 +20,20 @@ max_batch_size: 1 max_num_tokens: 8192 max_seq_len: 8192 print_iter_log: true -kv_cache_dtype: fp8 disable_overlap_scheduler: true -autotuner_enabled: false +enable_autotuner: false # Enable Speculative Decoding in the model engine speculative_config: decoding_type: Eagle max_draft_len: 1 - pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3 - eagle3_one_model: False + speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3 + eagle3_one_model: false kv_cache_config: free_gpu_memory_fraction: 0.5 enable_block_reuse: false + dtype: fp8 + +cache_transceiver_config: + backend: default diff --git a/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_agg.yml b/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_agg.yml index 56ccf8d07d..cada38087c 100644 --- a/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_agg.yml +++ b/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_agg.yml @@ -24,7 +24,7 @@ disable_overlap_scheduler: true # disable_overlap_scheduler is having acc issue speculative_config: decoding_type: Eagle max_draft_len: 3 - pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3 + speculative_model_dir: 
nvidia/Llama-4-Maverick-17B-128E-Eagle3 eagle3_one_model: true kv_cache_config: diff --git a/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_decode.yaml b/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_decode.yaml index 556a1365f5..43f04e2715 100644 --- a/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_decode.yaml +++ b/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_decode.yaml @@ -26,7 +26,7 @@ disable_overlap_scheduler: true speculative_config: decoding_type: Eagle max_draft_len: 3 - pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3 + speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3 eagle3_one_model: True kv_cache_config: @@ -38,3 +38,6 @@ cuda_graph_config: max_batch_size: 256 print_iter_log: true + +cache_transceiver_config: + backend: default diff --git a/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_prefill.yaml b/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_prefill.yaml index a75d2a6219..1cfc62ab02 100644 --- a/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_prefill.yaml +++ b/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_prefill.yaml @@ -26,9 +26,12 @@ disable_overlap_scheduler: true speculative_config: decoding_type: Eagle max_draft_len: 3 - pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3 + speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3 eagle3_one_model: True kv_cache_config: free_gpu_memory_fraction: 0.5 enable_block_reuse: false + +cache_transceiver_config: + backend: default diff --git a/components/backends/trtllm/engine_configs/prefill.yaml b/components/backends/trtllm/engine_configs/prefill.yaml index 5dee9e653d..265d1f9289 100644 --- a/components/backends/trtllm/engine_configs/prefill.yaml +++ b/components/backends/trtllm/engine_configs/prefill.yaml @@ -16,13 +16,15 @@ tensor_parallel_size: 1 moe_expert_parallel_size: 1 enable_attention_dp: false max_num_tokens: 8192 -max_batch_size: 16 trust_remote_code: true backend: pytorch enable_chunked_prefill: true # Overlap scheduler not currently supported in prefill only workers. disable_overlap_scheduler: true -use_cuda_graph: false - +cuda_graph_config: + max_batch_size: 16 kv_cache_config: free_gpu_memory_fraction: 0.95 + +cache_transceiver_config: + backend: default \ No newline at end of file diff --git a/components/backends/trtllm/src/dynamo/trtllm/main.py b/components/backends/trtllm/src/dynamo/trtllm/main.py index f6988fd34c..144f780849 100644 --- a/components/backends/trtllm/src/dynamo/trtllm/main.py +++ b/components/backends/trtllm/src/dynamo/trtllm/main.py @@ -101,8 +101,10 @@ async def init(runtime: DistributedRuntime, config: Config): kv_cache_config["event_buffer_max_size"] = DEFAULT_KV_EVENT_BUFFER_MAX_SIZE else: kv_cache_config = arg_map["kv_cache_config"] - if not kv_cache_config.event_buffer_max_size: - kv_cache_config.event_buffer_max_size = DEFAULT_KV_EVENT_BUFFER_MAX_SIZE + if "event_buffer_max_size" not in kv_cache_config: + kv_cache_config[ + "event_buffer_max_size" + ] = DEFAULT_KV_EVENT_BUFFER_MAX_SIZE arg_map["kv_cache_config"] = kv_cache_config # Only pytorch backend is supported for now to publish events and metrics. 
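The `main.py` hunk above switches `kv_cache_config` from attribute access to plain-dict access when defaulting the KV event buffer size. A minimal sketch of that defaulting behaviour, assuming a placeholder value for `DEFAULT_KV_EVENT_BUFFER_MAX_SIZE` (the real constant lives in `dynamo.trtllm`):

```python
# Sketch only, not the dynamo.trtllm implementation: kv_cache_config is treated
# as a plain dict, and event_buffer_max_size is filled in only when unset.
DEFAULT_KV_EVENT_BUFFER_MAX_SIZE = 1024  # assumed placeholder value

def ensure_event_buffer_size(kv_cache_config: dict) -> dict:
    """Return a copy of kv_cache_config with event_buffer_max_size defaulted."""
    kv_cache_config = dict(kv_cache_config)
    kv_cache_config.setdefault("event_buffer_max_size", DEFAULT_KV_EVENT_BUFFER_MAX_SIZE)
    return kv_cache_config

print(ensure_event_buffer_size({"free_gpu_memory_fraction": 0.95}))
# -> {'free_gpu_memory_fraction': 0.95, 'event_buffer_max_size': 1024}
```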
diff --git a/components/backends/vllm/README.md b/components/backends/vllm/README.md index f20b9bb9d0..986fc32337 100644 --- a/components/backends/vllm/README.md +++ b/components/backends/vllm/README.md @@ -35,12 +35,12 @@ git checkout $(git describe --tags $(git rev-list --tags --max-count=1)) | Feature | vLLM | Notes | |---------|------|-------| -| [**Disaggregated Serving**](../../docs/architecture/disagg_serving.md) | ✅ | | -| [**Conditional Disaggregation**](../../docs/architecture/disagg_serving.md#conditional-disaggregation) | 🚧 | WIP | -| [**KV-Aware Routing**](../../docs/architecture/kv_cache_routing.md) | ✅ | | -| [**SLA-Based Planner**](../../docs/architecture/sla_planner.md) | ✅ | | -| [**Load Based Planner**](../../docs/architecture/load_planner.md) | 🚧 | WIP | -| [**KVBM**](../../docs/architecture/kvbm_architecture.md) | 🚧 | WIP | +| [**Disaggregated Serving**](../../../docs/architecture/disagg_serving.md) | ✅ | | +| [**Conditional Disaggregation**](../../../docs/architecture/disagg_serving.md#conditional-disaggregation) | 🚧 | WIP | +| [**KV-Aware Routing**](../../../docs/architecture/kv_cache_routing.md) | ✅ | | +| [**SLA-Based Planner**](../../../docs/architecture/sla_planner.md) | ✅ | | +| [**Load Based Planner**](../../../docs/architecture/load_planner.md) | 🚧 | WIP | +| [**KVBM**](../../../docs/architecture/kvbm_architecture.md) | 🚧 | WIP | ### Large Scale P/D and WideEP Features @@ -152,73 +152,7 @@ Below we provide a selected list of advanced deployments. Please open up an issu ### Kubernetes Deployment -For Kubernetes deployment, YAML manifests are provided in the `deploy/` directory. These define DynamoGraphDeployment resources for various configurations: - -- `agg.yaml` - Aggregated serving -- `agg_router.yaml` - Aggregated serving with KV routing -- `disagg.yaml` - Disaggregated serving -- `disagg_router.yaml` - Disaggregated serving with KV routing -- `disagg_planner.yaml` - Disaggregated serving with [SLA Planner](../../../docs/architecture/sla_planner.md). See [SLA Planner Deployment Guide](../../../docs/guides/dynamo_deploy/sla_planner_deployment.md) for more details. - -#### Prerequisites - -- **Dynamo Cloud**: Follow the [Quickstart Guide](../../../docs/guides/dynamo_deploy/quickstart.md) to deploy Dynamo Cloud first. - -- **Container Images**: We have public images available on [NGC Catalog](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-dynamo/collections/ai-dynamo/artifacts). If you'd prefer to use your own registry, build and push your own image: - ```bash - ./container/build.sh --framework VLLM - # Tag and push to your container registry - # Update the image references in the YAML files - ``` - -- **Pre-Deployment Profiling (if Using SLA Planner)**: Follow the [pre-deployment profiling guide](../../../docs/architecture/pre_deployment_profiling.md) to run pre-deployment profiling. The results will be saved to the `profiling-pvc` PVC and queried by the SLA Planner. - -- **Port Forwarding**: After deployment, forward the frontend service to access the API: - ```bash - kubectl port-forward deployment/vllm-v1-disagg-frontend- 8080:8000 - ``` - -#### Deploy to Kubernetes - -Example with disagg: -Export the NAMESPACE you used in your Dynamo Cloud Installation. - -```bash -cd dynamo -cd components/backends/vllm/deploy -kubectl apply -f disagg.yaml -n $NAMESPACE -``` - -To change `DYN_LOG` level, edit the yaml file by adding - -```yaml -... -spec: - envs: - - name: DYN_LOG - value: "debug" # or other log levels - ... 
-``` - -### Testing the Deployment - -Send a test request to verify your deployment: - -```bash -curl localhost:8080/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "Qwen/Qwen3-0.6B", - "messages": [ - { - "role": "user", - "content": "In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden." - } - ], - "stream": false, - "max_tokens": 30 - }' -``` +For complete Kubernetes deployment instructions, configurations, and troubleshooting, see [vLLM Kubernetes Deployment Guide](deploy/README.md) ## Configuration @@ -235,7 +169,7 @@ The [documentation](https://docs.vllm.ai/en/v0.9.2/configuration/serve_args.html ## Request Migration -In a [Distributed System](#distributed-system), a request may fail due to connectivity issues between the Frontend and the Backend. +In a Distributed System, a request may fail due to connectivity issues between the Frontend and the Backend. The Frontend will automatically track which Backends are having connectivity issues with it and avoid routing new requests to the Backends with known connectivity issues. diff --git a/components/backends/vllm/deploy/README.md b/components/backends/vllm/deploy/README.md new file mode 100644 index 0000000000..cb3d442836 --- /dev/null +++ b/components/backends/vllm/deploy/README.md @@ -0,0 +1,255 @@ +# vLLM Kubernetes Deployment Configurations + +This directory contains Kubernetes Custom Resource Definition (CRD) templates for deploying vLLM inference graphs using the **DynamoGraphDeployment** resource. + +## Available Deployment Patterns + +### 1. **Aggregated Deployment** (`agg.yaml`) +Basic deployment pattern with frontend and a single decode worker. + +**Architecture:** +- `Frontend`: OpenAI-compatible API server (with kv router mode disabled) +- `VLLMDecodeWorker`: Single worker handling both prefill and decode + +### 2. **Aggregated Router Deployment** (`agg_router.yaml`) +Enhanced aggregated deployment with KV cache routing capabilities. + +**Architecture:** +- `Frontend`: OpenAI-compatible API server (with kv router mode enabled) +- `VLLMDecodeWorker`: Single worker handling both prefill and decode + +### 3. **Disaggregated Deployment** (`disagg.yaml`) +High-performance deployment with separated prefill and decode workers. + +**Architecture:** +- `Frontend`: HTTP API server coordinating between workers +- `VLLMDecodeWorker`: Specialized decode-only worker +- `VLLMPrefillWorker`: Specialized prefill-only worker (`--is-prefill-worker`) +- Communication via NIXL transfer backend + +### 4. **Disaggregated Router Deployment** (`disagg_router.yaml`) +Advanced disaggregated deployment with KV cache routing capabilities. 
+ +**Architecture:** +- `Frontend`: HTTP API server with KV-aware routing +- `VLLMDecodeWorker`: Specialized decode-only worker +- `VLLMPrefillWorker`: Specialized prefill-only worker (`--is-prefill-worker`) + +## CRD Structure + +All templates use the **DynamoGraphDeployment** CRD: + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: +spec: + services: + : + # Service configuration +``` + +### Key Configuration Options + +**Resource Management:** +```yaml +resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" +``` + +**Container Configuration:** +```yaml +extraPodSpec: + mainContainer: + image: my-registry/vllm-runtime:my-tag + workingDir: /workspace/components/backends/vllm + args: + - "python3" + - "-m" + - "dynamo.vllm" + # Model-specific arguments +``` + +## Prerequisites + +Before using these templates, ensure you have: + +1. **Dynamo Cloud Platform installed** - See [Quickstart Guide](../../../../docs/guides/dynamo_deploy/quickstart.md) +2. **Kubernetes cluster with GPU support** +3. **Container registry access** for vLLM runtime images +4. **HuggingFace token secret** (referenced as `envFromSecret: hf-token-secret`) + +### Container Images + +We have public images available on [NGC Catalog](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-dynamo/collections/ai-dynamo/artifacts). If you'd prefer to use your own registry, build and push your own image: + +```bash +./container/build.sh --framework VLLM +# Tag and push to your container registry +# Update the image references in the YAML files +``` + +### Pre-Deployment Profiling (SLA Planner Only) + +If using the SLA Planner deployment (`disagg_planner.yaml`), follow the [pre-deployment profiling guide](../../../../docs/architecture/pre_deployment_profiling.md) to run pre-deployment profiling. The results will be saved to the `profiling-pvc` PVC and queried by the SLA Planner. + +## Usage + +### 1. Choose Your Template +Select the deployment pattern that matches your requirements: +- Use `agg.yaml` for simple testing +- Use `agg_router.yaml` for production with load balancing +- Use `disagg.yaml` for maximum performance +- Use `disagg_router.yaml` for high-performance with KV cache routing +- Use `disagg_planner.yaml` for SLA-optimized performance + +### 2. Customize Configuration +Edit the template to match your environment: + +```yaml +# Update image registry and tag +image: your-registry/vllm-runtime:your-tag + +# Configure your model +args: + - "--model" + - "your-org/your-model" +``` + +### 3. Deploy + +Use the following command to deploy the deployment file. + +First, create a secret for the HuggingFace token. +```bash +export HF_TOKEN=your_hf_token +kubectl create secret generic hf-token-secret \ + --from-literal=HF_TOKEN=${HF_TOKEN} \ + -n ${NAMESPACE} +``` + +Then, deploy the model using the deployment file. + +Export the NAMESPACE you used in your Dynamo Cloud Installation. + +```bash +cd /components/backends/vllm/deploy +export DEPLOYMENT_FILE=agg.yaml + +kubectl apply -f $DEPLOYMENT_FILE -n $NAMESPACE +``` + +### 4. 
Using Custom Dynamo Frameworks Image for vLLM + +To use a custom dynamo frameworks image for vLLM, you can update the deployment file using yq: + +```bash +export DEPLOYMENT_FILE=agg.yaml +export FRAMEWORK_RUNTIME_IMAGE= + +yq '.spec.services.[].extraPodSpec.mainContainer.image = env(FRAMEWORK_RUNTIME_IMAGE)' $DEPLOYMENT_FILE > $DEPLOYMENT_FILE.generated +kubectl apply -f $DEPLOYMENT_FILE.generated -n $NAMESPACE +``` + +### 5. Port Forwarding + +After deployment, forward the frontend service to access the API: + +```bash +kubectl port-forward deployment/vllm-v1-disagg-frontend- 8000:8000 +``` + +## Configuration Options + +### Environment Variables + +To change `DYN_LOG` level, edit the yaml file by adding: + +```yaml +... +spec: + envs: + - name: DYN_LOG + value: "debug" # or other log levels + ... +``` + +### vLLM Worker Configuration + +vLLM workers are configured through command-line arguments. Key parameters include: + +- `--endpoint`: Dynamo endpoint in format `dyn://namespace.component.endpoint` +- `--model`: Model to serve (e.g., `Qwen/Qwen3-0.6B`) +- `--is-prefill-worker`: Enable prefill-only mode for disaggregated serving +- `--metrics-endpoint-port`: Port for publishing KV metrics to Dynamo + +See the [vLLM CLI documentation](https://docs.vllm.ai/en/v0.9.2/configuration/serve_args.html?h=serve+arg) for the full list of configuration options. + +## Testing the Deployment + +Send a test request to verify your deployment: + +```bash +curl localhost:8080/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen3-0.6B", + "messages": [ + { + "role": "user", + "content": "In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden." + } + ], + "stream": false, + "max_tokens": 30 + }' +``` + +## Model Configuration + +All templates use **Qwen/Qwen3-0.6B** as the default model, but you can use any vLLM-supported LLM model and configuration arguments. 
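For scripted smoke tests, the curl call in the Testing the Deployment section above can also be issued from Python. A minimal sketch, assuming the frontend has been port-forwarded to local port 8000 (adjust the URL to match your port-forward mapping; the curl example targets 8080) and that the `requests` package is installed:

```python
import requests

# Assumes `kubectl port-forward ... 8000:8000` is running; change the port if
# your forward mapping differs.
resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "Qwen/Qwen3-0.6B",
        "messages": [
            {"role": "user", "content": "Give me a two-sentence summary of Aeloria."}
        ],
        "stream": False,
        "max_tokens": 30,
    },
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```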
+ +## Monitoring and Health + +- **Frontend health endpoint**: `http://:8000/health` +- **Liveness probes**: Check process health regularly +- **KV metrics**: Published via metrics endpoint port + +## Request Migration + +You can enable [request migration](../../../../docs/architecture/request_migration.md) to handle worker failures gracefully by adding the migration limit argument to worker configurations: + +```yaml +args: + - "--migration-limit" + - "3" +``` + +## Further Reading + +- **Deployment Guide**: [Creating Kubernetes Deployments](../../../../docs/guides/dynamo_deploy/create_deployment.md) +- **Quickstart**: [Deployment Quickstart](../../../../docs/guides/dynamo_deploy/quickstart.md) +- **Platform Setup**: [Dynamo Cloud Installation](../../../../docs/guides/dynamo_deploy/dynamo_cloud.md) +- **SLA Planner**: [SLA Planner Deployment Guide](../../../../docs/guides/dynamo_deploy/sla_planner_deployment.md) +- **Examples**: [Deployment Examples](../../../../docs/examples/README.md) +- **Architecture Docs**: [Disaggregated Serving](../../../../docs/architecture/disagg_serving.md), [KV-Aware Routing](../../../../docs/architecture/kv_cache_routing.md) + +## Troubleshooting + +Common issues and solutions: + +1. **Pod fails to start**: Check image registry access and HuggingFace token secret +2. **GPU not allocated**: Verify cluster has GPU nodes and proper resource limits +3. **Health check failures**: Review model loading logs and increase `initialDelaySeconds` +4. **Out of memory**: Increase memory limits or reduce model batch size +5. **Port forwarding issues**: Ensure correct pod UUID in port-forward command + +For additional support, refer to the [deployment troubleshooting guide](../../../../docs/guides/dynamo_deploy/quickstart.md#troubleshooting). diff --git a/components/backends/vllm/src/dynamo/vllm/args.py b/components/backends/vllm/src/dynamo/vllm/args.py index b86649f06b..889405f6af 100644 --- a/components/backends/vllm/src/dynamo/vllm/args.py +++ b/components/backends/vllm/src/dynamo/vllm/args.py @@ -2,13 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 -import asyncio -import json import logging import os -import socket import sys -import time from typing import Optional from vllm.config import KVTransferConfig @@ -16,9 +12,20 @@ from vllm.engine.arg_utils import AsyncEngineArgs from vllm.utils import FlexibleArgumentParser +from .ports import ( + DEFAULT_DYNAMO_PORT_MAX, + DEFAULT_DYNAMO_PORT_MIN, + DynamoPortRange, + EtcdContext, + PortAllocationRequest, + PortMetadata, + allocate_and_reserve_port, + allocate_and_reserve_port_block, + get_host_ip, +) + logger = logging.getLogger(__name__) -# Only used if you run it manually from the command line DEFAULT_ENDPOINT = "dyn://dynamo.backend.generate" DEFAULT_MODEL = "Qwen/Qwen3-0.6B" @@ -34,6 +41,7 @@ class Config: migration_limit: int = 0 kv_port: Optional[int] = None side_channel_port: Optional[int] = None + port_range: DynamoPortRange # mirror vLLM model: str @@ -64,6 +72,18 @@ def parse_args() -> Config: default=0, help="Maximum number of times a request may be migrated to a different engine worker. The number may be overridden by the engine.", ) + parser.add_argument( + "--dynamo-port-min", + type=int, + default=DEFAULT_DYNAMO_PORT_MIN, + help=f"Minimum port number for Dynamo services (default: {DEFAULT_DYNAMO_PORT_MIN}). 
Must be in registered ports range (1024-49151).", + ) + parser.add_argument( + "--dynamo-port-max", + type=int, + default=DEFAULT_DYNAMO_PORT_MAX, + help=f"Maximum port number for Dynamo services (default: {DEFAULT_DYNAMO_PORT_MAX}). Must be in registered ports range (1024-49151).", + ) parser = AsyncEngineArgs.add_cli_args(parser) args = parser.parse_args() @@ -110,6 +130,9 @@ def parse_args() -> Config: config.engine_args = engine_args config.is_prefill_worker = args.is_prefill_worker config.migration_limit = args.migration_limit + config.port_range = DynamoPortRange( + min=args.dynamo_port_min, max=args.dynamo_port_max + ) if config.engine_args.block_size is None: config.engine_args.block_size = 16 @@ -120,106 +143,66 @@ def parse_args() -> Config: return config -async def allocate_and_reserve_port( - namespace, - etcd_client, - worker_id: str, - reason: str, - max_attempts: int = 100, -) -> int: - """ - Get an OS-assigned port and atomically reserve it in ETCD. - Retries until successful or max_attempts reached. - - Args: - max_attempts: Maximum number of ports to try (default: 100) - - Raises: - RuntimeError: If unable to reserve a port within max_attempts - OSError: If unable to create sockets (system resource issues) - """ - - node_name = socket.gethostname() - try: - node_ip = socket.gethostbyname(node_name) - except socket.gaierror: - # If hostname cannot be resolved, fall back to localhost - logger.warning( - f"Hostname '{node_name}' cannot be resolved, falling back to '127.0.0.1'" - ) - node_ip = "127.0.0.1" - - for attempt in range(1, max_attempts + 1): - # Hold socket open just long enough to reserve in ETCD - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: - sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - sock.bind(("", 0)) - port = sock.getsockname()[1] - - # Reserve in ETCD while holding the socket - key = f"dyn://{namespace}/ports/{node_ip}/{port}" - value = { - "worker_id": worker_id, - "reason": reason, - "reserved_at": time.time(), - "pid": os.getpid(), - } - - try: - await etcd_client.kv_create( - key=key, - value=json.dumps(value).encode(), - lease_id=etcd_client.primary_lease_id(), - ) - logger.debug(f"Reserved OS-assigned port {port} for {worker_id}") - return port - - except Exception as e: - logger.debug( - f"Port {port} on {node_name} was already reserved (attempt {attempt}): {e}" - ) - - if attempt < max_attempts: - await asyncio.sleep(0.01) - - raise RuntimeError( - f"Failed to allocate and reserve a port after {max_attempts} attempts" - ) - - async def configure_ports_with_etcd(config: Config, etcd_client): """Configure all settings that require ETCD, including port allocation and vLLM overrides.""" - # First, allocate ports + etcd_context = EtcdContext(client=etcd_client, namespace=config.namespace) + dp_rank = config.engine_args.data_parallel_rank or 0 worker_id = f"vllm-{config.component}-dp{dp_rank}" # Allocate KV events port - kv_port = await allocate_and_reserve_port( - namespace=config.namespace, - etcd_client=etcd_client, - worker_id=f"{worker_id}", - reason="zmq_kv_event_port", + if config.engine_args.enable_prefix_caching: + kv_metadata = PortMetadata(worker_id=worker_id, reason="zmq_kv_event_port") + kv_port = await allocate_and_reserve_port( + etcd_context=etcd_context, + metadata=kv_metadata, + port_range=config.port_range, + ) + config.kv_port = kv_port + logger.info(f"Allocated ZMQ KV events port: {kv_port} (worker_id={worker_id})") + + # Allocate side channel ports + # 
https://github.com/vllm-project/vllm/blob/releases/v0.10.0/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py#L372 + # NIXL calculates ports as: base_port + (dp_rank * tp_size) + tp_rank + # For dp_rank, we need to reserve tp_size consecutive ports + tp_size = config.engine_args.tensor_parallel_size or 1 + + # The first port for this dp_rank will be at: base_port + (dp_rank * tp_size) + # We need to allocate tp_size consecutive ports starting from there + nixl_metadata = PortMetadata(worker_id=worker_id, reason="nixl_side_channel_port") + nixl_request = PortAllocationRequest( + etcd_context=etcd_context, + metadata=nixl_metadata, + port_range=config.port_range, + block_size=tp_size, ) + allocated_ports = await allocate_and_reserve_port_block(nixl_request) + first_port_for_dp_rank = allocated_ports[0] + + # Calculate the base port that NIXL expects + # base_port = first_port_for_dp_rank - (dp_rank * tp_size) + nixl_offset = dp_rank * tp_size + base_side_channel_port = first_port_for_dp_rank - nixl_offset + + if base_side_channel_port < 0: + raise ValueError( + f"NIXL base port calculation resulted in negative port: " + f"first_allocated_port={first_port_for_dp_rank}, offset={nixl_offset}, " + f"base_port={base_side_channel_port}. Current range: {config.port_range.min}-{config.port_range.max}. " + f"Consider using a higher port range." + ) - # Allocate side channel port - side_channel_port = await allocate_and_reserve_port( - namespace=config.namespace, - etcd_client=etcd_client, - worker_id=f"{worker_id}", - reason="nixl_side_channel_port", - ) + config.side_channel_port = base_side_channel_port - # Update config with allocated ports - config.kv_port = kv_port - config.side_channel_port = side_channel_port + logger.info( + f"Allocated NIXL side channel ports: base={base_side_channel_port}, " + f"allocated_ports={allocated_ports} (worker_id={worker_id}, dp_rank={dp_rank}, tp_size={tp_size})" + ) def overwrite_args(config): """Set vLLM defaults for Dynamo.""" - assert ( - config.kv_port is not None - ), "Must set the kv_port, use configure_ports_with_etcd" assert ( config.side_channel_port is not None ), "Must set the kv_port, use configure_ports_with_etcd" @@ -263,36 +246,6 @@ def overwrite_args(config): raise ValueError(f"{key} not found in AsyncEngineArgs from vLLM.") -def get_host_ip() -> str: - """Get the IP address of the host. - This is needed for the side channel to work in multi-node deployments. - """ - try: - host_name = socket.gethostname() - except socket.error as e: - logger.warning(f"Failed to get hostname: {e}, falling back to '127.0.0.1'") - return "127.0.0.1" - else: - try: - # Get the IP address of the hostname - this is needed for the side channel to work in multi-node deployments - host_ip = socket.gethostbyname(host_name) - # Test if the IP is actually usable by binding to it - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as test_socket: - test_socket.bind((host_ip, 0)) - return host_ip - except socket.gaierror as e: - logger.warning( - f"Hostname '{host_name}' cannot be resolved: {e}, falling back to '127.0.0.1'" - ) - return "127.0.0.1" - except socket.error as e: - # If hostname is not usable for binding, fall back to localhost - logger.warning( - f"Hostname '{host_name}' is not usable for binding: {e}, falling back to '127.0.0.1'" - ) - return "127.0.0.1" - - def set_side_channel_host_and_port(config: Config): """vLLM V1 NixlConnector creates a side channel to exchange metadata with other NIXL connectors. 
This sets the port number for the side channel. diff --git a/components/backends/vllm/src/dynamo/vllm/ports.py b/components/backends/vllm/src/dynamo/vllm/ports.py new file mode 100644 index 0000000000..19fdde7279 --- /dev/null +++ b/components/backends/vllm/src/dynamo/vllm/ports.py @@ -0,0 +1,290 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Port allocation and management utilities for Dynamo services.""" + +import asyncio +import json +import logging +import os +import random +import socket +import time +from contextlib import contextmanager +from dataclasses import dataclass, field + +from dynamo.runtime import EtcdKvCache + +logger = logging.getLogger(__name__) + +# Default port range in the registered ports section +DEFAULT_DYNAMO_PORT_MIN = 20000 +DEFAULT_DYNAMO_PORT_MAX = 30000 + + +@dataclass +class DynamoPortRange: + """Port range configuration for Dynamo services""" + + min: int + max: int + + def __post_init__(self): + if self.min < 1024 or self.max > 49151: + raise ValueError( + f"Port range {self.min}-{self.max} is outside registered ports range (1024-49151)" + ) + if self.min >= self.max: + raise ValueError( + f"Invalid port range: min ({self.min}) must be less than max ({self.max})" + ) + + +@dataclass +class EtcdContext: + """Context for ETCD operations""" + + client: EtcdKvCache # etcd client instance + namespace: str # Namespace for keys (used in key prefix) + + def make_port_key(self, port: int) -> str: + """Generate ETCD key for a port reservation""" + node_ip = get_host_ip() + return f"dyn://{self.namespace}/ports/{node_ip}/{port}" + + +@dataclass +class PortMetadata: + """Metadata to store with port reservations in ETCD""" + + worker_id: str # Worker identifier (e.g., "vllm-backend-dp0") + reason: str # Purpose of the port (e.g., "nixl_side_channel_port") + block_info: dict = field(default_factory=dict) # Optional block allocation info + + def to_etcd_value(self) -> dict: + """Convert to dictionary for ETCD storage""" + value = { + "worker_id": self.worker_id, + "reason": self.reason, + "reserved_at": time.time(), + "pid": os.getpid(), + } + if self.block_info: + value.update(self.block_info) + return value + + +@dataclass +class PortAllocationRequest: + """Parameters for port allocation""" + + etcd_context: EtcdContext + metadata: PortMetadata + port_range: DynamoPortRange + block_size: int = 1 + max_attempts: int = 100 + + +@contextmanager +def hold_ports(ports: int | list[int]): + """Context manager to hold port binding(s). + + Holds socket bindings to ensure exclusive access to ports during reservation. + Can handle a single port or multiple ports. 
+ + Args: + ports: Single port number or list of port numbers to hold + """ + if isinstance(ports, int): + ports = [ports] + + sockets = [] + try: + for port in ports: + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + sock.bind(("", port)) + sockets.append(sock) + + yield + + finally: + for sock in sockets: + sock.close() + + +def check_port_available(port: int) -> bool: + """Check if a specific port is available for binding.""" + try: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + sock.bind(("", port)) + return True + except OSError: + return False + + +async def reserve_port_in_etcd( + etcd_context: EtcdContext, + port: int, + metadata: PortMetadata, +) -> None: + """Reserve a single port in ETCD.""" + key = etcd_context.make_port_key(port) + value = metadata.to_etcd_value() + + await etcd_context.client.kv_create( + key=key, + value=json.dumps(value).encode(), + lease_id=etcd_context.client.primary_lease_id(), + ) + + +async def allocate_and_reserve_port_block(request: PortAllocationRequest) -> list[int]: + """ + Allocate a contiguous block of ports from the specified range and atomically reserve them in ETCD. + Returns a list of all allocated ports in order. + + This function uses a context manager to hold port bindings while reserving in ETCD, + preventing race conditions between multiple processes. + + Args: + request: PortAllocationRequest containing all allocation parameters + + Returns: + list[int]: List of all allocated ports in ascending order + + Raises: + RuntimeError: If unable to reserve a port block within max_attempts + OSError: If unable to create sockets (system resource issues) + """ + # Create a list of valid starting ports (must have room for the entire block) + max_start_port = request.port_range.max - request.block_size + 1 + if max_start_port < request.port_range.min: + raise ValueError( + f"Port range {request.port_range.min}-{request.port_range.max} is too small for block size {request.block_size}" + ) + + available_start_ports = list(range(request.port_range.min, max_start_port + 1)) + random.shuffle(available_start_ports) + + actual_max_attempts = min(len(available_start_ports), request.max_attempts) + + for attempt in range(1, actual_max_attempts + 1): + start_port = available_start_ports[attempt - 1] + ports_to_reserve = list(range(start_port, start_port + request.block_size)) + + try: + # Try to bind to all ports in the block atomically + with hold_ports(ports_to_reserve): + logger.debug( + f"Successfully bound to ports {ports_to_reserve}, now reserving in ETCD" + ) + + # We have exclusive access to these ports, now reserve them in ETCD + for i, port in enumerate(ports_to_reserve): + port_metadata = PortMetadata( + worker_id=f"{request.metadata.worker_id}-{i}" + if request.block_size > 1 + else request.metadata.worker_id, + reason=request.metadata.reason, + block_info={ + "block_index": i, + "block_size": request.block_size, + "block_start": start_port, + } + if request.block_size > 1 + else {}, + ) + + await reserve_port_in_etcd( + etcd_context=request.etcd_context, + port=port, + metadata=port_metadata, + ) + + logger.debug( + f"Reserved port block {ports_to_reserve} from range {request.port_range.min}-{request.port_range.max} " + f"for {request.metadata.worker_id} (block_size={request.block_size})" + ) + return ports_to_reserve + + except OSError as e: + logger.debug( + f"Failed to bind to port block starting at {start_port} (attempt {attempt}): {e}" + ) + except 
Exception as e: + logger.debug( + f"Failed to reserve port block starting at {start_port} in ETCD (attempt {attempt}): {e}" + ) + + if attempt < actual_max_attempts: + await asyncio.sleep(0.01) + + raise RuntimeError( + f"Failed to allocate and reserve a port block of size {request.block_size} from range " + f"{request.port_range.min}-{request.port_range.max} after {actual_max_attempts} attempts" + ) + + +async def allocate_and_reserve_port( + etcd_context: EtcdContext, + metadata: PortMetadata, + port_range: DynamoPortRange, + max_attempts: int = 100, +) -> int: + """ + Allocate a port from the specified range and atomically reserve it in ETCD. + This is a convenience wrapper around allocate_and_reserve_port_block with block_size=1. + + Args: + etcd_context: ETCD context for operations + metadata: Port metadata for ETCD storage + port_range: DynamoPortRange object specifying min and max ports to try + max_attempts: Maximum number of ports to try (default: 100) + + Returns: + int: The allocated port number + + Raises: + RuntimeError: If unable to reserve a port within max_attempts + OSError: If unable to create sockets (system resource issues) + """ + request = PortAllocationRequest( + etcd_context=etcd_context, + metadata=metadata, + port_range=port_range, + block_size=1, + max_attempts=max_attempts, + ) + allocated_ports = await allocate_and_reserve_port_block(request) + return allocated_ports[0] # Return the single allocated port + + +def get_host_ip() -> str: + """Get the IP address of the host. + This is needed for the side channel to work in multi-node deployments. + """ + try: + host_name = socket.gethostname() + except socket.error as e: + logger.warning(f"Failed to get hostname: {e}, falling back to '127.0.0.1'") + return "127.0.0.1" + else: + try: + # Get the IP address of the hostname - this is needed for the side channel to work in multi-node deployments + host_ip = socket.gethostbyname(host_name) + # Test if the IP is actually usable by binding to it + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as test_socket: + test_socket.bind((host_ip, 0)) + return host_ip + except socket.gaierror as e: + logger.warning( + f"Hostname '{host_name}' cannot be resolved: {e}, falling back to '127.0.0.1'" + ) + return "127.0.0.1" + except socket.error as e: + # If hostname is not usable for binding, fall back to localhost + logger.warning( + f"Hostname '{host_name}' is not usable for binding: {e}, falling back to '127.0.0.1'" + ) + return "127.0.0.1" diff --git a/container/Dockerfile.sglang b/container/Dockerfile.sglang index 8557684096..fca22f722e 100644 --- a/container/Dockerfile.sglang +++ b/container/Dockerfile.sglang @@ -40,7 +40,7 @@ ARG ARCH ARG ARCH_ALT ARG NIXL_UCX_REF=v1.19.x -ARG NIXL_REF=3c47a48955e6f96bd5d4fb43a9d80bb64722f8e4 +ARG NIXL_REF=0.4.1 ENV NIXL_SRC_DIR=/opt/nixl ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl @@ -378,8 +378,6 @@ RUN --mount=type=bind,source=./container/launch_message.txt,target=/workspace/la sed '/^#\s/d' /workspace/launch_message.txt > ~/.launch_screen && \ echo "cat ~/.launch_screen" >> ~/.bashrc -ENV PYTHONPATH=/workspace/dynamo/components/planner/src:/workspace/examples/sglang/utils:$PYTHONPATH - ######################################## ########## Development Image ########### ######################################## @@ -429,8 +427,9 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ RUN apt-get update && \ DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ build-essential python3-dev libnuma-dev \ - # Curl for 
polling various endpoints. + # jq and curl for polling various endpoints and health checks curl \ + jq \ # For debugging vim \ # Libraries required by UCX to find RDMA devices @@ -446,7 +445,10 @@ RUN apt-get update && \ COPY --from=ci_minimum /workspace/target/release/metrics /usr/local/bin/metrics COPY --from=wheel_builder /workspace/dist/*.whl wheelhouse/ COPY --from=base /workspace/wheels/nixl/*.whl wheelhouse/ -RUN uv pip install ai-dynamo[sglang] --find-links wheelhouse + +# Install flashinfer-python pre-release version separately, then install ai-dynamo with sglang support +RUN uv pip install "flashinfer-python==0.2.9rc2" --prerelease=allow && \ + uv pip install "ai-dynamo[sglang]" --find-links wheelhouse # Copy launch banner RUN --mount=type=bind,source=./container/launch_message.txt,target=/workspace/launch_message.txt \ @@ -466,7 +468,5 @@ RUN uv pip install /workspace/benchmarks # Copy attribution files COPY ATTRIBUTION* LICENSE /workspace/ -ENV PYTHONPATH=/workspace/examples/sglang/utils:$PYTHONPATH - ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"] CMD [] diff --git a/container/Dockerfile.sglang-wideep b/container/Dockerfile.sglang-wideep index 0bbcb3af23..68cea2a559 100644 --- a/container/Dockerfile.sglang-wideep +++ b/container/Dockerfile.sglang-wideep @@ -71,7 +71,7 @@ RUN rm -rf /opt/hpcx/ucx && \ ENV LD_LIBRARY_PATH=/usr/lib:/usr/local/ucx/lib:$LD_LIBRARY_PATH -ARG NIXL_TAG=0.3.1 +ARG NIXL_TAG=0.4.1 RUN git clone https://github.com/ai-dynamo/nixl.git && cd nixl && git checkout ${NIXL_TAG} && pip install --break-system-packages . --config-settings=setup-args="-Ducx_path=/usr/local/ucx" WORKDIR /sgl-workspace @@ -121,7 +121,7 @@ ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16} RUN cargo build --release RUN cd lib/bindings/python && pip install --break-system-packages -e . && cd ../../.. -RUN pip install --break-system-packages -e . +RUN pip install --break-system-packages . RUN wget --tries=3 --waitretry=5 https://github.com/nats-io/nats-server/releases/download/v2.10.28/nats-server-v2.10.28-${ARCH}.deb && \ dpkg -i nats-server-v2.10.28-${ARCH}.deb && rm nats-server-v2.10.28-${ARCH}.deb @@ -152,6 +152,9 @@ RUN cmake --version RUN apt-get update -y && \ apt-get install -y --no-install-recommends \ rapidjson-dev \ + # jq and curl for polling various endpoints and health checks + jq \ + curl \ zlib1g-dev RUN git clone --depth=1 https://github.com/triton-inference-server/perf_analyzer.git && \ diff --git a/container/Dockerfile.tensorrt_llm b/container/Dockerfile.tensorrt_llm index 4a6cd167bf..2293b6ffb7 100644 --- a/container/Dockerfile.tensorrt_llm +++ b/container/Dockerfile.tensorrt_llm @@ -45,7 +45,7 @@ ARG ARCH ARG ARCH_ALT ARG NIXL_UCX_REF=v1.19.x -ARG NIXL_REF=3c47a48955e6f96bd5d4fb43a9d80bb64722f8e4 +ARG NIXL_REF=0.4.1 ENV NIXL_SRC_DIR=/opt/nixl ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl @@ -150,6 +150,7 @@ COPY --from=trtllm_wheel . /trtllm_wheel/ # Note: TensorRT needs to be uninstalled before installing the TRTLLM wheel # because there might be mismatched versions of TensorRT between the NGC PyTorch # and the TRTLLM wheel. 
+# Locking triton version to 3.3.1 as 3.4.0 breaks tensorrt-llm 1.0.0rc4 RUN [ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true && \ pip uninstall -y tensorrt && \ if [ "$HAS_TRTLLM_CONTEXT" = "1" ]; then \ @@ -157,14 +158,19 @@ RUN [ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true && \ WHEEL_FILE=$(find /trtllm_wheel -name "*.whl" | head -n 1); \ if [ -n "$WHEEL_FILE" ]; then \ pip install "$WHEEL_FILE"; \ + if [ "$ARCH" = "amd64" ]; then \ + pip install "triton==3.3.1"; \ + fi; \ else \ echo "No wheel file found in /trtllm_wheel directory."; \ exit 1; \ fi; \ else \ - # Install TensorRT-LLM wheel from the provided index URL, allow dependencies from PyPI - pip install --extra-index-url "${TENSORRTLLM_INDEX_URL}" \ - "${TENSORRTLLM_PIP_WHEEL}" ; \ + # Install TensorRT-LLM wheel from the provided index URL, allow dependencies from PyPI + pip install --extra-index-url "${TENSORRTLLM_INDEX_URL}" "${TENSORRTLLM_PIP_WHEEL}"; \ + if [ "$ARCH" = "amd64" ]; then \ + pip install "triton==3.3.1"; \ + fi; \ fi # Install test dependencies @@ -367,12 +373,25 @@ CMD [] FROM ${RUNTIME_IMAGE}:${RUNTIME_IMAGE_TAG} AS runtime +WORKDIR /workspace + ARG ARCH_ALT -WORKDIR /workspace ENV DYNAMO_HOME=/workspace ENV VIRTUAL_ENV=/opt/dynamo/venv ENV PATH="${VIRTUAL_ENV}/bin:${PATH}" +ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl +ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu +ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins +ENV LD_LIBRARY_PATH=\ +$NIXL_LIB_DIR:\ +$NIXL_PLUGIN_DIR:\ +/usr/local/ucx/lib:\ +/usr/local/ucx/lib/ucx:\ +/opt/hpcx/ompi/lib:\ +$LD_LIBRARY_PATH +ENV PATH=/opt/hpcx/ompi/bin:/usr/local/bin/etcd/:/usr/local/cuda/nvvm/bin:$PATH +ENV OPAL_PREFIX=/opt/hpcx/ompi # Install apt dependencies # openssh-client, openssh-server are needed for OpenMPI @@ -380,7 +399,8 @@ RUN apt-get update && \ DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ build-essential \ python3-dev \ - # Curl for polling various endpoints. 
+ # jq and curl for polling various endpoints and health checks + jq \ curl \ # For debugging vim \ @@ -466,21 +486,6 @@ COPY --from=build /usr/local/lib/python3.12/dist-packages/flash_attn /usr/local/ COPY --from=build /usr/local/lib/python3.12/dist-packages/flash_attn-${FLASH_ATTN_VER}.dist-info /usr/local/lib/python3.12/dist-packages/flash_attn-${FLASH_ATTN_VER}.dist-info COPY --from=build /usr/local/lib/python3.12/dist-packages/flash_attn_2_cuda.cpython-312-*-linux-gnu.so /usr/local/lib/python3.12/dist-packages/ -# Setup environment variables -ARG ARCH_ALT -ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl -ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu -ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins - -ENV LD_LIBRARY_PATH=\ -$NIXL_LIB_DIR:\ -$NIXL_PLUGIN_DIR:\ -/usr/local/ucx/lib:\ -/usr/local/ucx/lib/ucx:\ -/opt/hpcx/ompi/lib:\ -$LD_LIBRARY_PATH -ENV PATH=/opt/hpcx/ompi/bin:/usr/local/bin/etcd/:/usr/local/cuda/nvvm/bin:$PATH -ENV OPAL_PREFIX=/opt/hpcx/ompi # Install TensorRT-LLM (same as in build stage) ARG HAS_TRTLLM_CONTEXT=0 @@ -489,16 +494,19 @@ ARG TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple" # Copy Dynamo wheels into wheelhouse # Copy metrics binary from wheel_builder image, not part of ai-dynamo wheel -COPY --from=dev /workspace/wheels/nixl/*.whl wheelhouse/ -COPY --from=wheel_builder /workspace/dist/*.whl wheelhouse/ +COPY --from=dev /workspace/wheels/nixl/*.whl /workspace/wheelhouse/ +COPY --from=wheel_builder /workspace/dist/*.whl /workspace/wheelhouse/ COPY --from=dev /workspace/target/release/metrics /usr/local/bin/metrics # NOTE: If a package (tensorrt_llm) exists on both --index-url and --extra-index-url, # uv will prioritize the --extra-index-url, unless --index-strategy unsafe-best-match # is also specified. So set the configurable index as a --extra-index-url for prioritization. -RUN uv pip install --extra-index-url "${TENSORRTLLM_INDEX_URL}" \ - "${TENSORRTLLM_PIP_WHEEL}" && \ - uv pip install ai-dynamo nixl --find-links wheelhouse +# locking triton version to 3.3.1 as 3.4.0 breaks tensorrt-llm 1.0.0rc4 +RUN uv pip install --extra-index-url "${TENSORRTLLM_INDEX_URL}" "${TENSORRTLLM_PIP_WHEEL}" && \ + if [ "$ARCH" = "amd64" ]; then \ + pip install "triton==3.3.1"; \ + fi; \ + uv pip install ai-dynamo nixl --find-links /workspace/wheelhouse # Setup TRTLLM environment variables, same as in dev image ENV TRTLLM_USE_UCX_KVCACHE=1 diff --git a/container/Dockerfile.vllm b/container/Dockerfile.vllm index 1ad4d253e7..52fd10708f 100644 --- a/container/Dockerfile.vllm +++ b/container/Dockerfile.vllm @@ -81,7 +81,7 @@ RUN apt-get update -y && \ rm -rf /var/lib/apt/lists/* ARG NIXL_UCX_REF=v1.19.x -ARG NIXL_REF=3c47a48955e6f96bd5d4fb43a9d80bb64722f8e4 +ARG NIXL_REF=0.4.1 ENV NIXL_SRC_DIR=/opt/nixl ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl @@ -437,7 +437,8 @@ RUN apt-get update && \ DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ build-essential \ python3-dev \ - # Curl for polling various endpoints. + # jq and curl for polling various endpoints and health checks + jq \ curl \ # For debugging vim \ diff --git a/container/build.sh b/container/build.sh index 03f79588c3..b5fe731d79 100755 --- a/container/build.sh +++ b/container/build.sh @@ -88,7 +88,7 @@ TENSORRTLLM_PIP_WHEEL_DIR="/tmp/trtllm_wheel/" # TensorRT-LLM commit to use for building the trtllm wheel if not provided. # Important Note: This commit is not used in our CI pipeline. See the CI # variables to learn how to run a pipeline with a specific commit. 
-DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="137fe35539ea182f1495f5021bfda97c729e50c3" +DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="69e9f6d48944b2ae0124ff57aa59340aa4dfae15" TRTLLM_COMMIT="" TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="0" @@ -96,7 +96,7 @@ TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="0" TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple" # TODO: Remove the version specification from here and use the ai-dynamo[trtllm] package. # Need to update the Dockerfile.tensorrt_llm to use the ai-dynamo[trtllm] package. -DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.0.0rc0" +DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.0.0rc4" TENSORRTLLM_PIP_WHEEL="" @@ -113,7 +113,7 @@ NONE_BASE_IMAGE_TAG="24.04" SGLANG_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" SGLANG_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04" -NIXL_REF=3c47a48955e6f96bd5d4fb43a9d80bb64722f8e4 +NIXL_REF=0.4.1 NIXL_UCX_EFA_REF=7ec95b95e524a87e81cac92f5ca8523e3966b16b NO_CACHE="" @@ -389,8 +389,6 @@ ARCH="amd64" if [[ "$PLATFORM" == *"linux/arm64"* ]]; then ARCH="arm64" BUILD_ARGS+=" --build-arg ARCH=arm64 --build-arg ARCH_ALT=aarch64 " - # TEMP: Pin to nixl 0.3.1 for arm build, since 0.4.0 fails - NIXL_REF=3503658e71143b56f9d5b1b440d84a94b9c41af8 fi # Update DOCKERFILE if framework is VLLM diff --git a/deploy/cloud/README.md b/deploy/cloud/README.md index 0f4ad5635e..dfbb10f392 100644 --- a/deploy/cloud/README.md +++ b/deploy/cloud/README.md @@ -21,6 +21,6 @@ This directory contains the infrastructure components required for the Dynamo cl For detailed documentation on setting up and using the Dynamo Cloud Platform, please refer to: - [Dynamo Cloud Platform Guide](../../docs/guides/dynamo_deploy/dynamo_cloud.md) -- [Operator Deployment Guide](../../docs/guides/dynamo_deploy/operator_deployment.md) +- [Operator Deployment Guide](../../docs/guides/dynamo_deploy/dynamo_operator.md) -For a quick start example, see [examples/hello_world/README.md#deploying-to-kubernetes-using-dynamo-cloud-and-dynamo-deploy-cli](../../examples/hello_world/README.md#deploying-to-kubernetes-using-dynamo-cloud-and-dynamo-deploy-cli) \ No newline at end of file +For a quick start example, see [examples/runtime/hello_world/README.md#deployment-to-kubernetes](../../examples/runtime/hello_world/README.md#deployment-to-kubernetes) \ No newline at end of file diff --git a/deploy/cloud/helm/deploy.sh b/deploy/cloud/helm/deploy.sh index 1866be0481..e9b9225c81 100755 --- a/deploy/cloud/helm/deploy.sh +++ b/deploy/cloud/helm/deploy.sh @@ -49,7 +49,6 @@ export ISTIO_GATEWAY="${ISTIO_GATEWAY:=istio-system/istio-ingressgateway}" export INGRESS_CLASS="${INGRESS_CLASS:=nginx}" export VIRTUAL_SERVICE_SUPPORTS_HTTPS="${VIRTUAL_SERVICE_SUPPORTS_HTTPS:=false}" export ENABLE_LWS="${ENABLE_LWS:=false}" -export ENABLE_GROVE="${ENABLE_GROVE:=false}" # Add command line options INTERACTIVE=false @@ -165,7 +164,7 @@ echo "DYNAMO_INGRESS_SUFFIX: $DYNAMO_INGRESS_SUFFIX" echo "VIRTUAL_SERVICE_SUPPORTS_HTTPS: $VIRTUAL_SERVICE_SUPPORTS_HTTPS" echo "INSTALL_CRDS: $INSTALL_CRDS" -envsubst '${NAMESPACE} ${RELEASE_NAME} ${DOCKER_USERNAME} ${DOCKER_PASSWORD} ${DOCKER_SERVER} ${IMAGE_TAG} ${DYNAMO_INGRESS_SUFFIX} ${PIPELINES_DOCKER_SERVER} ${PIPELINES_DOCKER_USERNAME} ${PIPELINES_DOCKER_PASSWORD} ${DOCKER_SECRET_NAME} ${INGRESS_ENABLED} ${ISTIO_ENABLED} ${INGRESS_CLASS} ${ISTIO_GATEWAY} ${VIRTUAL_SERVICE_SUPPORTS_HTTPS} ${ENABLE_LWS} ${ENABLE_GROVE}' < dynamo-platform-values.yaml > generated-values.yaml +envsubst '${NAMESPACE} ${RELEASE_NAME} ${DOCKER_USERNAME} ${DOCKER_PASSWORD} ${DOCKER_SERVER} ${IMAGE_TAG} 
${DYNAMO_INGRESS_SUFFIX} ${PIPELINES_DOCKER_SERVER} ${PIPELINES_DOCKER_USERNAME} ${PIPELINES_DOCKER_PASSWORD} ${DOCKER_SECRET_NAME} ${INGRESS_ENABLED} ${ISTIO_ENABLED} ${INGRESS_CLASS} ${ISTIO_GATEWAY} ${VIRTUAL_SERVICE_SUPPORTS_HTTPS} ${ENABLE_LWS}' < dynamo-platform-values.yaml > generated-values.yaml echo "generated file contents:" cat generated-values.yaml diff --git a/deploy/cloud/helm/dynamo-platform-values.yaml b/deploy/cloud/helm/dynamo-platform-values.yaml index 269a5962c1..4ead5fd98b 100644 --- a/deploy/cloud/helm/dynamo-platform-values.yaml +++ b/deploy/cloud/helm/dynamo-platform-values.yaml @@ -24,7 +24,6 @@ dynamo-operator: dynamo: enableLWS: ${ENABLE_LWS} - enableGrove: ${ENABLE_GROVE} ingress: enabled: ${INGRESS_ENABLED} className: ${INGRESS_CLASS} diff --git a/deploy/cloud/helm/platform/components/operator/templates/deployment.yaml b/deploy/cloud/helm/platform/components/operator/templates/deployment.yaml index bb570f2e78..6c6fe1abba 100644 --- a/deploy/cloud/helm/platform/components/operator/templates/deployment.yaml +++ b/deploy/cloud/helm/platform/components/operator/templates/deployment.yaml @@ -100,8 +100,8 @@ spec: {{- if .Values.dynamo.enableLWS }} - --enable-lws {{- end }} - {{- if .Values.dynamo.enableGrove }} - - --enable-grove + {{- if .Values.dynamo.groveTerminationDelay }} + - --grove-termination-delay={{ .Values.dynamo.groveTerminationDelay }} {{- end }} command: - /manager diff --git a/deploy/cloud/helm/platform/components/operator/templates/manager-rbac.yaml b/deploy/cloud/helm/platform/components/operator/templates/manager-rbac.yaml index f245ce4b68..bf084e5a1b 100644 --- a/deploy/cloud/helm/platform/components/operator/templates/manager-rbac.yaml +++ b/deploy/cloud/helm/platform/components/operator/templates/manager-rbac.yaml @@ -116,7 +116,6 @@ rules: - patch - update - watch -{{- if .Values.dynamo.enableGrove }} - apiGroups: - grove.io resources: @@ -129,7 +128,6 @@ rules: - patch - update - watch -{{- end }} - apiGroups: - apps resources: diff --git a/deploy/cloud/helm/platform/components/operator/values.yaml b/deploy/cloud/helm/platform/components/operator/values.yaml index 086677fcb0..540d23a768 100644 --- a/deploy/cloud/helm/platform/components/operator/values.yaml +++ b/deploy/cloud/helm/platform/components/operator/values.yaml @@ -82,7 +82,7 @@ dynamo: annotations: {} enableLWS: false - enableGrove: false + groveTerminationDelay: 15m internalImages: debugger: python:3.12-slim diff --git a/deploy/cloud/helm/platform/values.yaml b/deploy/cloud/helm/platform/values.yaml index baec3d0e8b..c9b3b9924a 100644 --- a/deploy/cloud/helm/platform/values.yaml +++ b/deploy/cloud/helm/platform/values.yaml @@ -34,7 +34,7 @@ dynamo-operator: imagePullSecrets: [] dynamo: enableLWS: false - enableGrove: false + groveTerminationDelay: 15m internalImages: debugger: python:3.12-slim enableRestrictedSecurityContext: false diff --git a/deploy/cloud/operator/cmd/main.go b/deploy/cloud/operator/cmd/main.go index 845d59afb6..ac8a142caa 100644 --- a/deploy/cloud/operator/cmd/main.go +++ b/deploy/cloud/operator/cmd/main.go @@ -30,6 +30,7 @@ import ( // to ensure that exec-entrypoint and run can make use of them. 
clientv3 "go.etcd.io/etcd/client/v3" corev1 "k8s.io/api/core/v1" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" _ "k8s.io/client-go/plugin/pkg/client/auth" @@ -50,6 +51,7 @@ import ( grovev1alpha1 "github.com/NVIDIA/grove/operator/api/core/v1alpha1" nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1" + "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts" "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller" commonController "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common" "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/etcd" @@ -73,6 +75,10 @@ func init() { utilruntime.Must(volcanoscheme.AddToScheme(scheme)) utilruntime.Must(grovev1alpha1.AddToScheme(scheme)) + + utilruntime.Must(apiextensionsv1.AddToScheme(scheme)) + + utilruntime.Must(istioclientsetscheme.AddToScheme(scheme)) //+kubebuilder:scaffold:scheme } @@ -92,7 +98,7 @@ func main() { var ingressControllerTLSSecretName string var ingressHostSuffix string var enableLWS bool - var enableGrove bool + var groveTerminationDelay time.Duration flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.") flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") flag.BoolVar(&enableLeaderElection, "leader-elect", false, @@ -120,22 +126,23 @@ func main() { "The suffix to use for the ingress host") flag.BoolVar(&enableLWS, "enable-lws", false, "If set, enable leader worker set") - flag.BoolVar(&enableGrove, "enable-grove", false, - "If set, enable grove") + flag.DurationVar(&groveTerminationDelay, "grove-termination-delay", consts.DefaultGroveTerminationDelay, + "The termination delay for Grove PodGangSets") opts := zap.Options{ Development: true, } opts.BindFlags(flag.CommandLine) flag.Parse() - utilruntime.Must(istioclientsetscheme.AddToScheme(scheme)) - ctrlConfig := commonController.Config{ RestrictedNamespace: restrictedNamespace, EnableLWS: enableLWS, - EnableGrove: enableGrove, - EtcdAddress: etcdAddr, - NatsAddress: natsAddr, + Grove: commonController.GroveConfig{ + Enabled: false, // Will be set after Grove discovery + TerminationDelay: groveTerminationDelay, + }, + EtcdAddress: etcdAddr, + NatsAddress: natsAddr, IngressConfig: commonController.IngressConfig{ VirtualServiceGateway: istioVirtualServiceGateway, IngressControllerClassName: ingressControllerClassName, @@ -201,6 +208,11 @@ func main() { os.Exit(1) } + // Detect Grove availability using discovery client + setupLog.Info("Detecting Grove availability...") + groveEnabled := commonController.DetectGroveAvailability(mainCtx, mgr) + ctrlConfig.Grove.Enabled = groveEnabled + // Create etcd client cli, err := clientv3.New(clientv3.Config{ Endpoints: []string{etcdAddr}, diff --git a/deploy/cloud/operator/internal/consts/consts.go b/deploy/cloud/operator/internal/consts/consts.go index a744d45249..599bdf51f3 100644 --- a/deploy/cloud/operator/internal/consts/consts.go +++ b/deploy/cloud/operator/internal/consts/consts.go @@ -1,5 +1,7 @@ package consts +import "time" + const ( HPACPUDefaultAverageUtilization = 80 @@ -37,4 +39,6 @@ const ( PlannerServiceAccountName = "planner-serviceaccount" DefaultIngressSuffix = "local" + + DefaultGroveTerminationDelay = 15 * time.Minute ) diff --git a/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go 
b/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go index cc669be47e..36601fff97 100644 --- a/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go +++ b/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go @@ -144,7 +144,7 @@ type Resource interface { func (r *DynamoGraphDeploymentReconciler) reconcileResources(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) (State, Reason, Message, error) { logger := log.FromContext(ctx) - if r.Config.EnableGrove { + if r.Config.Grove.Enabled { // check if explicit opt out of grove if dynamoDeployment.Annotations[consts.KubeAnnotationEnableGrove] == consts.KubeLabelValueFalse { logger.Info("Grove is explicitly disabled for this deployment, skipping grove resources reconciliation") @@ -308,7 +308,7 @@ func (r *DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) err GenericFunc: func(ge event.GenericEvent) bool { return true }, })). WithEventFilter(commonController.EphemeralDeploymentEventFilter(r.Config)) - if r.Config.EnableGrove { + if r.Config.Grove.Enabled { ctrlBuilder = ctrlBuilder.Owns(&grovev1alpha1.PodGangSet{}, builder.WithPredicates(predicate.Funcs{ // ignore creation cause we don't want to be called again after we create the pod gang set CreateFunc: func(ce event.CreateEvent) bool { return false }, diff --git a/deploy/cloud/operator/internal/controller_common/predicate.go b/deploy/cloud/operator/internal/controller_common/predicate.go index 539fde2714..5ad7724cfb 100644 --- a/deploy/cloud/operator/internal/controller_common/predicate.go +++ b/deploy/cloud/operator/internal/controller_common/predicate.go @@ -20,18 +20,28 @@ package controller_common import ( "context" "strings" + "time" "k8s.io/apimachinery/pkg/api/meta" + "k8s.io/client-go/discovery" + ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/predicate" ) +type GroveConfig struct { + // Enabled is automatically determined by checking if Grove CRDs are installed in the cluster + Enabled bool + // TerminationDelay configures the termination delay for Grove PodGangSets + TerminationDelay time.Duration +} + type Config struct { // Enable resources filtering, only the resources belonging to the given namespace will be handled. 
RestrictedNamespace string EnableLWS bool - EnableGrove bool + Grove GroveConfig EtcdAddress string NatsAddress string IngressConfig IngressConfig @@ -48,6 +58,43 @@ func (i *IngressConfig) UseVirtualService() bool { return i.VirtualServiceGateway != "" } +// DetectGroveAvailability checks if Grove is available by checking if the Grove API group is registered +// This approach uses the discovery client which is simpler and more reliable +func DetectGroveAvailability(ctx context.Context, mgr ctrl.Manager) bool { + logger := log.FromContext(ctx) + + // Use the discovery client to check if Grove API groups are available + cfg := mgr.GetConfig() + if cfg == nil { + logger.Info("Grove detection failed, no discovery client available") + return false + } + + // Try to create a discovery client + discoveryClient, err := discovery.NewDiscoveryClientForConfig(cfg) + if err != nil { + logger.Error(err, "Grove detection failed, could not create discovery client") + return false + } + + // Check if grove.io API group is available + apiGroups, err := discoveryClient.ServerGroups() + if err != nil { + logger.Error(err, "Grove detection failed, could not list server groups") + return false + } + + for _, group := range apiGroups.Groups { + if group.Name == "grove.io" { + logger.Info("Grove is available, grove.io API group found") + return true + } + } + + logger.Info("Grove not available, grove.io API group not found") + return false +} + func EphemeralDeploymentEventFilter(config Config) predicate.Predicate { return predicate.NewPredicateFuncs(func(o client.Object) bool { l := log.FromContext(context.Background()) diff --git a/deploy/cloud/operator/internal/dynamo/graph.go b/deploy/cloud/operator/internal/dynamo/graph.go index a0ba1a4ae0..c6c39d4d34 100644 --- a/deploy/cloud/operator/internal/dynamo/graph.go +++ b/deploy/cloud/operator/internal/dynamo/graph.go @@ -167,7 +167,9 @@ func GenerateDynamoComponentsDeployments(ctx context.Context, parentDynamoGraphD labels[commonconsts.KubeLabelDynamoNamespace] = dynamoNamespace if component.ComponentType == commonconsts.ComponentTypePlanner { if deployment.Spec.ExtraPodSpec == nil { - deployment.Spec.ExtraPodSpec = &common.ExtraPodSpec{} + deployment.Spec.ExtraPodSpec = &common.ExtraPodSpec{ + PodSpec: &corev1.PodSpec{}, + } } deployment.Spec.ExtraPodSpec.ServiceAccountName = commonconsts.PlannerServiceAccountName } @@ -316,6 +318,9 @@ func GenerateGrovePodGangSet(ctx context.Context, dynamoDeployment *v1alpha1.Dyn gangSet.Name = dynamoDeployment.Name gangSet.Namespace = dynamoDeployment.Namespace gangSet.Spec.Replicas = 1 + if controllerConfig.Grove.TerminationDelay > 0 { + gangSet.Spec.Template.TerminationDelay = &metav1.Duration{Duration: controllerConfig.Grove.TerminationDelay} + } for componentName, component := range dynamoDeployment.Spec.Services { container := corev1.Container{ Name: "main", diff --git a/deploy/cloud/operator/internal/dynamo/graph_test.go b/deploy/cloud/operator/internal/dynamo/graph_test.go index bfa66c30d3..f6eda2e39e 100644 --- a/deploy/cloud/operator/internal/dynamo/graph_test.go +++ b/deploy/cloud/operator/internal/dynamo/graph_test.go @@ -23,6 +23,7 @@ import ( "reflect" "sort" "testing" + "time" grovev1alpha1 "github.com/NVIDIA/grove/operator/api/core/v1alpha1" "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/dynamo/common" @@ -1136,6 +1137,9 @@ func TestGenerateGrovePodGangSet(t *testing.T) { controllerConfig: controller_common.Config{ EtcdAddress: "etcd-address", NatsAddress: "nats-address", + Grove: 
controller_common.GroveConfig{ + TerminationDelay: 15 * time.Minute, + }, }, dynamoDeployment: &v1alpha1.DynamoGraphDeployment{ ObjectMeta: metav1.ObjectMeta{ @@ -1272,6 +1276,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) { Spec: grovev1alpha1.PodGangSetSpec{ Replicas: 1, Template: grovev1alpha1.PodGangSetTemplateSpec{ + TerminationDelay: &metav1.Duration{Duration: 15 * time.Minute}, Cliques: []*grovev1alpha1.PodCliqueTemplateSpec{ { Name: "frontend", diff --git a/deploy/inference-gateway/README.md b/deploy/inference-gateway/README.md index 7787d57b64..6f985e3f15 100644 --- a/deploy/inference-gateway/README.md +++ b/deploy/inference-gateway/README.md @@ -18,8 +18,7 @@ Currently, this setup is only kgateway based Inference Gateway. 1. **Install Dynamo Platform** -[See Quickstart Guide](../../../docs/guides/dynamo_deploy/quickstart.md) to install Dynamo Cloud. - +[See Quickstart Guide](../../docs/guides/dynamo_deploy/quickstart.md) to install Dynamo Cloud. 2. **Deploy Inference Gateway** @@ -70,7 +69,17 @@ kubectl get gateway inference-gateway -n my-model # inference-gateway kgateway x.x.x.x True 1m ``` -3. **Install dynamo model and dynamo gaie helm chart** +3. **Deploy model** + +Follow the steps in [model deployment](../../components/backends/vllm/deploy/README.md) to deploy the `Qwen/Qwen3-0.6B` model in aggregated mode using [agg.yaml](../../components/backends/vllm/deploy/agg.yaml) in the `my-model` Kubernetes namespace. + +Sample commands to deploy the model: +```bash +cd /components/backends/vllm/deploy +kubectl apply -f agg.yaml -n my-model +``` + +4. **Install Dynamo GAIE helm chart** The Inference Gateway is configured through the `inference-gateway-resources.yaml` file. diff --git a/deploy/metrics/README.md b/deploy/metrics/README.md index ce3b8e6aef..e23a13263f 100644 --- a/deploy/metrics/README.md +++ b/deploy/metrics/README.md @@ -87,7 +87,7 @@ Grafana is pre-configured with: ## Required Files The following configuration files should be present in this directory: -- [docker-compose.yml](./docker-compose.yml): Defines the Prometheus and Grafana services +- [docker-compose.yml](../docker-compose.yml): Defines the Prometheus and Grafana services - [prometheus.yml](./prometheus.yml): Contains Prometheus scraping configuration - [grafana-datasources.yml](./grafana-datasources.yml): Contains Grafana datasource configuration - [grafana_dashboards/grafana-dashboard-providers.yml](./grafana_dashboards/grafana-dashboard-providers.yml): Contains Grafana dashboard provider configuration diff --git a/docs/API/nixl_connect/connector.md b/docs/API/nixl_connect/connector.md index 7b8b1fa611..99bc81fc5b 100644 --- a/docs/API/nixl_connect/connector.md +++ b/docs/API/nixl_connect/connector.md @@ -28,7 +28,7 @@ The connector provides two methods of moving data between workers: - Preparing local memory to be read by a remote worker. -In both cases, local memory is registered with the NIXL-based RDMA subsystem via the [`Descriptor`](#descriptor) class and provided to the connector. +In both cases, local memory is registered with the NIXL-based RDMA subsystem via the [`Descriptor`](descriptor.md) class and provided to the connector. The connector then configures the RDMA subsystem to expose the memory for the requested operation and returns an operation control object.
The operation control object, either a [`ReadableOperation`](readable_operation.md) or a [`WritableOperation`](writable_operation.md), provides RDMA metadata ([RdmaMetadata](rdma_metadata.md)) via its `.metadata()` method, functionality to query the operation's current state, as well as the ability to cancel the operation prior to its completion. diff --git a/docs/architecture/dynamo_flow.md b/docs/architecture/dynamo_flow.md index 32146e1188..a17a7b11ec 100644 --- a/docs/architecture/dynamo_flow.md +++ b/docs/architecture/dynamo_flow.md @@ -17,7 +17,7 @@ limitations under the License. # Dynamo Architecture Flow -This diagram shows the NVIDIA Dynamo disaggregated inference system as implemented in [examples/llm](https://github.com/ai-dynamo/dynamo/tree/main/examples/llm). Color-coded flows indicate different types of operations: +This diagram shows the NVIDIA Dynamo disaggregated inference system as implemented in [examples/llm](https://github.com/ai-dynamo/dynamo/tree/v0.3.2/examples/llm). Color-coded flows indicate different types of operations: ## 🔵 Main Request Flow (Blue) The primary user journey through the system: diff --git a/docs/components/backends/llm/README.md b/docs/components/backends/llm/README.md deleted file mode 120000 index 615da9417b..0000000000 --- a/docs/components/backends/llm/README.md +++ /dev/null @@ -1 +0,0 @@ -../../../../components/backends/llm/README.md \ No newline at end of file diff --git a/docs/examples/README.md b/docs/examples/README.md index f9e22535d8..ec95678c59 100644 --- a/docs/examples/README.md +++ b/docs/examples/README.md @@ -40,9 +40,14 @@ kubectl apply -f components/backends/vllm/deploy/agg.yaml -n ${NAMESPACE} You can use `kubectl get dynamoGraphDeployment -n ${NAMESPACE}` to view your deployment. You can use `kubectl delete dynamoGraphDeployment -n ${NAMESPACE}` to delete the deployment. -We provide a Custom Resource yaml file for many examples under the `deploy/` folder. -Use [VLLM YAML](../../components/backends/vllm/deploy/agg.yaml) for an example. +We provide a Custom Resource YAML file for many examples under the `components/backends/{engine}/deploy/` folder. +Consult the examples below for the CRs for your specific inference backend. +[View SGLang K8s](/components/backends/sglang/deploy/README.md) + +[View vLLM K8s](/components/backends/vllm/deploy/README.md) + +[View TRTLLM K8s](/components/backends/trtllm/deploy/README.md) **Note 1** Example Image diff --git a/docs/guides/dynamo_deploy/README.md b/docs/guides/dynamo_deploy/README.md index 516162d911..c43de3e947 100644 --- a/docs/guides/dynamo_deploy/README.md +++ b/docs/guides/dynamo_deploy/README.md @@ -17,26 +17,85 @@ limitations under the License. # Deploying Inference Graphs to Kubernetes -We expect users to deploy their inference graphs using CRDs or helm charts. + We expect users to deploy their inference graphs using CRDs or helm charts. + +# 1. Install Dynamo Cloud. + +Prior to deploying an inference graph, the user should deploy the Dynamo Cloud Platform. Reference the [Quickstart Guide](quickstart.md) for steps to install Dynamo Cloud with Helm. -Prior to deploying an inference graph the user should deploy the Dynamo Cloud Platform. Dynamo Cloud acts as an orchestration layer between the end user and Kubernetes, handling the complexity of deploying your graphs for you. This is a one-time action, only necessary the first time you deploy a DynamoGraph. +# 2. Deploy your inference graph.
+ +We provide a Custom Resource YAML file for many examples under the components/backends/{engine}/deploy folders. Consult the examples below for the CRs for a specific inference backend. + +[View SGLang K8s](/components/backends/sglang/deploy/README.md) + +[View vLLM K8s](/components/backends/vllm/deploy/README.md) + +[View TRT-LLM K8s](/components/backends/trtllm/deploy/README.md) + +### Deploying a particular example + +```bash +# Set your dynamo root directory +cd +export PROJECT_ROOT=$(pwd) +export NAMESPACE= # the namespace you used to deploy Dynamo cloud to. +``` + +Deploying an example consists of the simple `kubectl apply -f ... -n ${NAMESPACE}` command. For example: + +```bash +kubectl apply -f components/backends/vllm/deploy/agg.yaml -n ${NAMESPACE} +``` + +You can use `kubectl get dynamoGraphDeployment -n ${NAMESPACE}` to view your deployment. +You can use `kubectl delete dynamoGraphDeployment -n ${NAMESPACE}` to delete the deployment. + +We provide a Custom Resource YAML file for many examples under the `deploy/` folder. +Use [VLLM YAML](../../components/backends/vllm/deploy/agg.yaml) for an example. + +**Note 1** Example Image + +The examples use a prebuilt image from the `nvcr.io` registry. +You can utilize public images from [Dynamo NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-dynamo/collections/ai-dynamo) or build your own image and update the image location in your CR file prior to applying. Either way, you will need to overwrite the image in the example YAML. + +To build your own image: + +```bash +./container/build.sh --framework +``` + +For example for the `sglang` run +```bash +./container/build.sh --framework sglang +``` + +To overwrite the image in the example: + +```bash +extraPodSpec: + mainContainer: + image: +``` -# 1. Please follow [Installing Dynamo Cloud](./dynamo_cloud.md) for steps to install. -For details about the Dynamo Cloud Platform, see the [Dynamo Operator Guide](dynamo_operator.md) +**Note 2** +Setup port forward if needed when deploying to Kubernetes. -# 2. Follow [Examples](../../examples/README.md) to see how you can deploy your Inference Graphs. +List the services in your namespace: +```bash +kubectl get svc -n ${NAMESPACE} +``` +Look for one that ends in `-frontend` and use it for port forward. 
-## Manual Deployment with Helm Charts +```bash +SERVICE_NAME=$(kubectl get svc -n ${NAMESPACE} -o name | grep frontend | sed 's|.*/||' | sed 's|-frontend||' | head -n1) +kubectl port-forward svc/${SERVICE_NAME}-frontend 8080:8080 -n ${NAMESPACE} +``` -Users who need more control over their deployments can use the manual deployment path (`deploy/helm/`): +Additional Resources: +- [Port Forward Documentation](https://kubernetes.io/docs/tasks/access-application-cluster/port-forward-access-application-cluster/) +- [Examples Deployment Guide](../../examples/README.md#deploying-a-particular-example) -- Used for manually deploying inference graphs to Kubernetes -- Contains Helm charts and configurations for deploying individual inference pipelines -- Provides full control over deployment parameters -- Requires manual management of infrastructure components -- Documentation: - - [Using the Deployment Script](manual_helm_deployment.md#using-the-deployment-script): all-in-one script for manual deployment - - [Helm Deployment Guide](manual_helm_deployment.md#helm-deployment-guide): detailed instructions for manual deployment diff --git a/docs/guides/dynamo_deploy/operator_deployment.md b/docs/guides/dynamo_deploy/operator_deployment.md deleted file mode 120000 index 80ca4341ee..0000000000 --- a/docs/guides/dynamo_deploy/operator_deployment.md +++ /dev/null @@ -1 +0,0 @@ -../../../guides/dynamo_deploy/operator_deployment.md \ No newline at end of file diff --git a/docs/guides/dynamo_deploy/quickstart.md b/docs/guides/dynamo_deploy/quickstart.md index 5639b92f87..fd49463a43 100644 --- a/docs/guides/dynamo_deploy/quickstart.md +++ b/docs/guides/dynamo_deploy/quickstart.md @@ -67,7 +67,7 @@ Ensure you have the source code checked out and are in the `dynamo` directory: ### Set Environment Variables -Our examples use the [`nvcr.io`](https://nvcr.io/nvidia/ai-dynamo/) but you can setup your own values if you use another docker registry. +Our examples use the `nvcr.io` but you can setup your own values if you use another docker registry. ```bash export NAMESPACE=dynamo-cloud # or whatever you prefer. diff --git a/docs/guides/dynamo_run.md b/docs/guides/dynamo_run.md index 0453fc7ccd..9a30270dea 100644 --- a/docs/guides/dynamo_run.md +++ b/docs/guides/dynamo_run.md @@ -211,7 +211,7 @@ The KV-aware routing arguments: ### Request Migration -In a [Distributed System](#distributed-system), a request may fail due to connectivity issues between the HTTP Server and the Worker Engine. +In a Distributed System, a request may fail due to connectivity issues between the HTTP Server and the Worker Engine. The HTTP Server will automatically track which Worker Engines are having connectivity issues with it and avoid routing new requests to the Engines with known connectivity issues. @@ -482,11 +482,11 @@ The trtllm engine requires [etcd](https://etcd.io/) and [nats](https://nats.io/) ##### Step 1: Build the environment -See instructions [here](https://github.com/ai-dynamo/dynamo/blob/main/examples/tensorrt_llm/README.md#build-docker) to build the dynamo container with TensorRT-LLM. +See instructions [here](https://github.com/ai-dynamo/dynamo/tree/main/components/backends/trtllm#build-container) to build the dynamo container with TensorRT-LLM. ##### Step 2: Run the environment -See instructions [here](https://github.com/ai-dynamo/dynamo/blob/main/examples/tensorrt_llm/README.md#run-container) to run the built environment. 
+See instructions [here](https://github.com/ai-dynamo/dynamo/tree/main/components/backends/trtllm#run-container) to run the built environment. ##### Step 3: Execute `dynamo-run` command @@ -679,10 +679,6 @@ Here are some example engines: - Chat: * [sglang](https://github.com/ai-dynamo/dynamo/blob/main/lib/bindings/python/examples/hello_world/server_sglang_tok.py) -More fully-featured Backend engines (used by `dynamo-run`): -- [vllm](https://github.com/ai-dynamo/dynamo/blob/main/launch/dynamo-run/src/subprocess/vllm_inc.py) -- [sglang](https://github.com/ai-dynamo/dynamo/blob/main/launch/dynamo-run/src/subprocess/sglang_inc.py) - ### Debugging `dynamo-run` and `dynamo-runtime` support [tokio-console](https://github.com/tokio-rs/console). Build with the feature to enable: diff --git a/docs/runtime/README.md b/docs/runtime/README.md index bcd29b8c70..77c0ef9cef 100644 --- a/docs/runtime/README.md +++ b/docs/runtime/README.md @@ -110,7 +110,7 @@ Annotated { data: Some("d"), id: None, event: None, comment: None } #### Python -See the [README.md](../API/python_bindings.md) for details +See the [README.md](../../lib/runtime/lib/bindings/python/README.md) for details The Python and Rust `hello_world` client and server examples are interchangeable, so you can start the Python `server.py` and talk to it from the Rust `client`. diff --git a/examples/README.md b/examples/README.md index 13fdfe5ad2..225cf13dba 100644 --- a/examples/README.md +++ b/examples/README.md @@ -22,6 +22,15 @@ This directory contains practical examples demonstrating how to deploy and use D > **Want to see a specific example?** > Open a [GitHub issue](https://github.com/ai-dynamo/dynamo/issues) to request an example you'd like to see, or [open a pull request](https://github.com/ai-dynamo/dynamo/pulls) if you'd like to contribute your own! +## Framework Support + +The /examples directory shows how Dynamo broadly works using major inference engines. + +If you want to see advanced, framework-specific deployment patterns and best practices, check out the [Components Workflows](../components/backends/) directory: +- **[vLLM](../components/backends/vllm/)** – vLLM-specific deployment and configuration +- **[SGLang](../components/backends/sglang/)** – SGLang integration examples and workflows +- **[TensorRT-LLM](../components/backends/trtllm/)** – TensorRT-LLM workflows and optimizations + ## Basics & Tutorials Learn fundamental Dynamo concepts through these introductory examples: @@ -67,13 +76,4 @@ Before running any examples, ensure you have: - **Docker & Docker Compose** - For containerized services - **CUDA-compatible GPU** - For LLM inference (except hello_world, which is non-GPU aware) - **Python 3.9++** - For client scripts and utilities -- **Kubernetes cluster** - For any cloud deployment/K8s examples - -## Framework Support - -These examples show how Dynamo broadly works using major inference engines. 
- -If you want to see advanced, framework-specific deployment patterns and best practices, check out the [Components Workflows](../components/backends/) directory: -- **[vLLM](../components/backends/vllm/)** – vLLM-specific deployment and configuration -- **[SGLang](../components/backends/sglang/)** – SGLang integration examples and workflows -- **[TensorRT-LLM](../components/backends/trtllm/)** – TensorRT-LLM workflows and optimizations \ No newline at end of file +- **Kubernetes cluster** - For any cloud deployment/K8s examples \ No newline at end of file diff --git a/examples/basics/disaggregated_serving/README.md b/examples/basics/disaggregated_serving/README.md index dee80fcb0f..ba501c43be 100644 --- a/examples/basics/disaggregated_serving/README.md +++ b/examples/basics/disaggregated_serving/README.md @@ -37,8 +37,8 @@ docker compose -f deploy/metrics/docker-compose.yml up -d ## Components - [Frontend](../../../components/frontend/README) - HTTP API endpoint that receives requests and forwards them to the decode worker -- [vLLM Prefill Worker](../../../components/backends/vllm/README) - Specialized worker for prefill phase execution -- [vLLM Decode Worker](../../../components/backends/vllm/README) - Specialized worker that handles requests and decides between local/remote prefill +- [vLLM Prefill Worker](../../../components/backends/vllm/README.md) - Specialized worker for prefill phase execution +- [vLLM Decode Worker](../../../components/backends/vllm/README.md) - Specialized worker that handles requests and decides between local/remote prefill ```mermaid --- diff --git a/examples/basics/multimodal/README.md b/examples/basics/multimodal/README.md deleted file mode 100644 index 693bfdeb98..0000000000 --- a/examples/basics/multimodal/README.md +++ /dev/null @@ -1,480 +0,0 @@ - - -# Multimodal Deployment Examples - -This directory provides example workflows and reference implementations for deploying a multimodal model using Dynamo. - -## Use the Latest Release - -We recommend using the latest stable release of dynamo to avoid breaking changes: - -[![GitHub Release](https://img.shields.io/github/v/release/ai-dynamo/dynamo)](https://github.com/ai-dynamo/dynamo/releases/latest) - -You can find the latest release [here](https://github.com/ai-dynamo/dynamo/releases/latest) and check out the corresponding branch with: - -```bash -git checkout $(git describe --tags $(git rev-list --tags --max-count=1)) -``` - -## Multimodal Aggregated Serving - -### Components - -- workers: For aggregated serving, we have two workers, [encode_worker](components/encode_worker.py) for encoding and [decode_worker](components/decode_worker.py) for prefilling and decoding. -- processor: Tokenizes the prompt and passes it to the decode worker. -- frontend: HTTP endpoint to handle incoming requests. - -### Graph - -In this graph, we have two workers, [encode_worker](components/encode_worker.py) and [decode_worker](components/decode_worker.py). -The encode worker is responsible for encoding the image and passing the embeddings to the decode worker via a combination of NATS and RDMA. -The work complete event is sent via NATS, while the embeddings tensor is transferred via RDMA through the NIXL interface. -Its decode worker then prefills and decodes the prompt, just like the [LLM aggregated serving](../llm/README.md) example. -By separating the encode from the prefill and decode stages, we can have a more flexible deployment and scale the -encode worker independently from the prefill and decode workers if needed. 
- -This figure shows the flow of the graph: -```mermaid -flowchart LR - HTTP --> processor - processor --> HTTP - processor --> decode_worker - decode_worker --> processor - decode_worker --image_url--> encode_worker - encode_worker --embeddings--> decode_worker -``` - -```bash -cd $DYNAMO_HOME/examples/multimodal -# Serve a LLaVA 1.5 7B model: -dynamo serve graphs.agg:Frontend -f ./configs/agg-llava.yaml -# Serve a Qwen2.5-VL model: -# dynamo serve graphs.agg:Frontend -f ./configs/agg-qwen.yaml -# Serve a Phi3V model: -# dynamo serve graphs.agg:Frontend -f ./configs/agg-phi3v.yaml -``` - -### Client - -In another terminal: -```bash -curl http://localhost:8080/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "llava-hf/llava-1.5-7b-hf", - "messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "What is in this image?" - }, - { - "type": "image_url", - "image_url": { - "url": "http://images.cocodataset.org/test2017/000000155781.jpg" - } - } - ] - } - ], - "max_tokens": 300, - "temperature": 0.0, - "stream": false - }' -``` - -If serving the example Qwen model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"Qwen/Qwen2.5-VL-7B-Instruct"`. If serving the example Phi3V model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"microsoft/Phi-3.5-vision-instruct"`. - -You should see a response similar to this: -```json -{"id": "c37b946e-9e58-4d54-88c8-2dbd92c47b0c", "object": "chat.completion", "created": 1747725277, "model": "llava-hf/llava-1.5-7b-hf", "choices": [{"index": 0, "message": {"role": "assistant", "content": " In the image, there is a city bus parked on a street, with a street sign nearby on the right side. The bus appears to be stopped out of service. The setting is in a foggy city, giving it a slightly moody atmosphere."}, "finish_reason": "stop"}]} -``` - -## Multimodal Disaggregated Serving - -### Components - -- workers: For disaggregated serving, we have three workers, [encode_worker](components/encode_worker.py) for encoding, [decode_worker](components/decode_worker.py) for decoding, and [prefill_worker](components/prefill_worker.py) for prefilling. -- processor: Tokenizes the prompt and passes it to the decode worker. -- frontend: HTTP endpoint to handle incoming requests. - -### Graph - -In this graph, we have three workers, [encode_worker](components/encode_worker.py), [decode_worker](components/decode_worker.py), and [prefill_worker](components/prefill_worker.py). -For the Llava model, embeddings are only required during the prefill stage. As such, the encode worker is connected directly to the prefill worker. -The encode worker is responsible for encoding the image and passing the embeddings to the prefill worker via a combination of NATS and RDMA. -Its work complete event is sent via NATS, while the embeddings tensor is transferred via RDMA through the NIXL interface. -The prefill worker performs the prefilling step and forwards the KV cache to the decode worker for decoding. -For more details on the roles of the prefill and decode workers, refer to the [LLM disaggregated serving](../llm/README.md) example. 
- -This figure shows the flow of the graph: -```mermaid -flowchart LR - HTTP --> processor - processor --> HTTP - processor --> decode_worker - decode_worker --> processor - decode_worker --> prefill_worker - prefill_worker --> decode_worker - prefill_worker --image_url--> encode_worker - encode_worker --embeddings--> prefill_worker -``` - -```bash -cd $DYNAMO_HOME/examples/multimodal -dynamo serve graphs.disagg:Frontend -f configs/disagg.yaml -``` - -### Client - -In another terminal: -```bash -curl http://localhost:8080/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "llava-hf/llava-1.5-7b-hf", - "messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "What is in this image?" - }, - { - "type": "image_url", - "image_url": { - "url": "http://images.cocodataset.org/test2017/000000155781.jpg" - } - } - ] - } - ], - "max_tokens": 300, - "temperature": 0.0, - "stream": false - }' -``` - -You should see a response similar to this: -```json -{"id": "c1774d61-3299-4aa3-bea1-a0af6c055ba8", "object": "chat.completion", "created": 1747725645, "model": "llava-hf/llava-1.5-7b-hf", "choices": [{"index": 0, "message": {"role": "assistant", "content": " This image shows a passenger bus traveling down the road near power lines and trees. The bus displays a sign that says \"OUT OF SERVICE\" on its front."}, "finish_reason": "stop"}]} -``` - -***Note***: disaggregation is currently only confirmed to work with LLaVA. Qwen VL and PhiV are not confirmed to be supported. - -## Deployment with Dynamo Operator - -These multimodal examples can be deployed to a Kubernetes cluster using [Dynamo Cloud](../../docs/guides/dynamo_deploy/dynamo_cloud.md) and the Dynamo CLI. - -### Prerequisites - -You must have first followed the instructions in [deploy/cloud/helm/README.md](../../deploy/cloud/helm/README.md) to install Dynamo Cloud on your Kubernetes cluster. - -**Note**: The `KUBE_NS` variable in the following steps must match the Kubernetes namespace where you installed Dynamo Cloud. You must also expose the `dynamo-store` service externally. This will be the endpoint the CLI uses to interface with Dynamo Cloud. - -### Deployment Steps - -For detailed deployment instructions, please refer to the [Operator Deployment Guide](../../docs/guides/dynamo_deploy/operator_deployment.md). The following are the specific commands for the multimodal examples: - -```bash -# Set your project root directory -export PROJECT_ROOT=$(pwd) - -# Configure environment variables (see operator_deployment.md for details) -export KUBE_NS=dynamo-cloud -export DYNAMO_CLOUD=http://localhost:8080 # If using port-forward -# OR -# export DYNAMO_CLOUD=https://dynamo-cloud.nvidia.com # If using Ingress/VirtualService - -# Build the Dynamo base image (see operator_deployment.md for details) -export DYNAMO_IMAGE=/: - -# TODO: Apply Dynamo graph deployment for the example -``` - -**Note**: To avoid rate limiting from unauthenticated requests to HuggingFace (HF), you can provide your `HF_TOKEN` as a secret in your deployment. See the [operator deployment guide](../../docs/guides/dynamo_deploy/operator_deployment.md#referencing-secrets-in-your-deployment) for instructions on referencing secrets like `HF_TOKEN` in your deployment configuration. - -**Note**: Optionally add `--Planner.no-operation=false` at the end of the deployment command to enable the planner component to take scaling actions on your deployment. - -### Testing the Deployment - -Once the deployment is complete, you can test it. 
If you have ingress available for your deployment, you can directly call the url returned -in `dynamo deployment get ${DEPLOYMENT_NAME}` and skip the steps to find and forward the frontend pod. - -```bash -# Find your frontend pod -export FRONTEND_POD=$(kubectl get pods -n ${KUBE_NS} | grep "${DEPLOYMENT_NAME}-frontend" | sort -k1 | tail -n1 | awk '{print $1}') - -# Forward the pod's port to localhost -kubectl port-forward pod/$FRONTEND_POD 8080:8080 -n ${KUBE_NS} - -# Test the API endpoint -curl localhost:8080/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "llava-hf/llava-1.5-7b-hf", - "messages": [ - { - "role": "user", - "content": [ - { "type": "text", "text": "What is in this image?" }, - { "type": "image_url", "image_url": { "url": "http://images.cocodataset.org/test2017/000000155781.jpg" } } - ] - } - ], - "max_tokens": 300, - "temperature": 0.0, - "stream": false - }' -``` - -If serving the example Qwen model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"Qwen/Qwen2.5-VL-7B-Instruct"`. If serving the example Phi3V model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"microsoft/Phi-3.5-vision-instruct"`. - -For more details on managing deployments, testing, and troubleshooting, please refer to the [Operator Deployment Guide](../../docs/guides/dynamo_deploy/operator_deployment.md). - -## Multimodal Aggregated Video Serving - -This example demonstrates deploying an aggregated multimodal model that can process video inputs. - -### Components - -- workers: For video serving, we have two workers, [video_encode_worker](components/video_encode_worker.py) for decoding video into frames, and [video_decode_worker](components/video_decode_worker.py) for prefilling and decoding. -- processor: Tokenizes the prompt and passes it to the decode worker. -- frontend: HTTP endpoint to handle incoming requests. - -### Graph - -In this graph, we have two workers, `video_encode_worker` and `video_decode_worker`. -The `video_encode_worker` is responsible for decoding the video into a series of frames. Unlike the image pipeline which generates embeddings, this pipeline passes the raw frames directly to the `video_decode_worker`. This transfer is done efficiently using RDMA. -The `video_decode_worker` then receives these frames, and performs prefill and decode steps with the model. Separating the video processing from the language model inference allows for flexible scaling. 
- -This figure shows the flow of the graph: -```mermaid -flowchart LR - HTTP --> processor - processor --> HTTP - processor --> video_decode_worker - video_decode_worker --> processor - video_decode_worker --video_url--> video_encode_worker - video_encode_worker --frames--> video_decode_worker -``` - -```bash -cd $DYNAMO_HOME/examples/multimodal -# Serve a LLaVA-NeXT-Video-7B model: -dynamo serve graphs.agg_video:Frontend -f ./configs/agg_video.yaml -``` - -### Client - -In another terminal: -```bash -curl -X 'POST' 'http://localhost:8080/v1/chat/completions' -H 'Content-Type: application/json' -d '{ - "model": "llava-hf/LLaVA-NeXT-Video-7B-hf", - "messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "Describe the video in detail" - }, - { - "type": "video_url", - "video_url": { - "url": "https://storage.googleapis.com/gtv-videos-bucket/sample/BigBuckBunny.mp4" - } - } - ] - } - ], - "max_tokens": 300, - "stream": false - }' | jq -``` - -You should see a response describing the video's content similar to -```json -{ - "id": "b5714626-5889-4bb7-8c51-f3bca65b4683", - "object": "chat.completion", - "created": 1749772533, - "model": "llava-hf/LLaVA-NeXT-Video-7B-hf", - "choices": [ - { - "index": 0, - "message": { - "role": "assistant", - "content": " Sure! The video features a group of anthropomorphic animals who appear human-like. They're out in a meadow, which is a large, open area covered in grasses, and have given human qualities like speaking and a desire to go on adventures. The animals are seen play-fighting with each other clearly seen glancing at the camera when they sense it, blinking, and Roman the second can be directly heard by the camera reciting the line, \"When the challenge becomes insane, the behavior becomes erratic.\" A white rabbit is the first in shot and he winks the left eye and flips the right ear before shaking with the mouse and squirrel friends on a blurry rock ledge under the sky. At some point, the rabbit turns towards the camera and starts playing with the thing, and there's a distant mountain in the background. Furthermore, a little animal from a tree in the background flies with two rocks, and it's joined by the rest of the group of friends. That outro is an elder turtle in the Ramden musical style saturated with a horn-like thing pattern." - }, - "finish_reason": "stop" - } - ] -} -``` - -## Multimodal Disaggregated Video Serving - -This example demonstrates deploying a disaggregated multimodal model that can process video inputs. - -### Dependency - -Video example relies on `av` package for video preprocessing inside the encode_worker. -Please install `av` inside the dynamo container to enable video example. - -`pip install av` - -### Components - -- workers: For disaggregated video serving, we have three workers, [video_encode_worker](components/video_encode_worker.py) for decoding video into frames, [video_decode_worker](components/video_decode_worker.py) for decoding, and [video_prefill_worker](components/video_prefill_worker.py) for prefilling. -- processor: Tokenizes the prompt and passes it to the decode worker. -- frontend: HTTP endpoint to handle incoming requests. - -### Graph - -In this graph, we have three workers, `video_encode_worker`, `video_decode_worker`, and `video_prefill_worker`. -For the LLaVA-NeXT-Video-7B model, frames are only required during the prefill stage. As such, the `video_encode_worker` is connected directly to the `video_prefill_worker`. 
-The `video_encode_worker` is responsible for decoding the video into a series of frames and passing them to the `video_prefill_worker` via RDMA. -The `video_prefill_worker` performs the prefilling step and forwards the KV cache to the `video_decode_worker` for decoding. - -This figure shows the flow of the graph: -```mermaid -flowchart LR - HTTP --> processor - processor --> HTTP - processor --> video_decode_worker - video_decode_worker --> processor - video_decode_worker --> video_prefill_worker - video_prefill_worker --> video_decode_worker - video_prefill_worker --video_url--> video_encode_worker - video_encode_worker --frames--> video_prefill_worker -``` - -```bash -cd $DYNAMO_HOME/examples/multimodal -# Serve a LLaVA-NeXT-Video-7B model: -dynamo serve graphs.disagg_video:Frontend -f ./configs/disagg_video.yaml -``` - -### Client - -In another terminal: -```bash -curl -X 'POST' 'http://localhost:8080/v1/chat/completions' -H 'Content-Type: application/json' -d '{ - "model": "llava-hf/LLaVA-NeXT-Video-7B-hf", - "messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "Describe the video in detail" - }, - { - "type": "video_url", - "video_url": { - "url": "https://storage.googleapis.com/gtv-videos-bucket/sample/BigBuckBunny.mp4" - } - } - ] - } - ], - "max_tokens": 300, - "stream": false - }' | jq -``` - -You should see a response describing the video's content similar to -```json -{ - "id": "d1d641b1-4daf-48d3-9d06-6a60743b5a42", - "object": "chat.completion", - "created": 1749775300, - "model": "llava-hf/LLaVA-NeXT-Video-7B-hf", - "choices": [ - { - "index": 0, - "message": { - "role": "assistant", - "content": " The video features two animals in a lush, green outdoor environment. On the ground, there is a white rabbit with big brown eyes, a playful expression, and two antlers. The rabbit is accompanied by a uniquely colored bird with orange pupils, possibly a squirrel or a hamster, sitting on its head. These two animals seem to have embarked on an unlikely journey, flying together in the sky. The backdrop showcases rolling green hills and trees under the pleasant weather. The sky is clear, indicating a beautiful day. The colors and contrast suggest the landscape is during spring or summer, signifying the rabbit and bird could also be engaging in outdoor activities during those seasons. Overall, it's a charming scene depicting an unlikely yet harmonious pair, enjoying a surprise adventure in nature." - }, - "finish_reason": "stop" - } - ] -} -``` - - -## Deploying Multimodal Examples on Kubernetes - -This guide will help you quickly deploy and clean up the multimodal example services in Kubernetes. - -### Prerequisites - -- **Dynamo Cloud** is already deployed in your target Kubernetes namespace. -- You have `kubectl` access to your cluster and the correct namespace set in `$NAMESPACE`. - - -### Create a secret with huggingface token - -```bash -export HF_TOKEN="huggingfacehub token with read permission to models" -kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN=$HF_TOKEN -n $KUBE_NS || true -``` - ---- - -Choose the example you want to deploy or delete. The YAML files are located in `examples/multimodal/deploy/k8s/`. 
- -### Deploy the Multimodal Example - -```bash -kubectl apply -f examples/multimodal/deploy/k8s/ -n $NAMESPACE -``` - -### Uninstall the Multimodal Example - - -```bash -kubectl delete -f examples/multimodal/deploy/k8s/ -n $NAMESPACE -``` - -### Using a different dynamo container - -To customize the container image used in your deployment, you will need to update the manifest before applying it. - -You can use [`yq`](https://github.com/mikefarah/yq?tab=readme-ov-file#install), a portable command-line YAML processor. - -Please follow the [installation instructions](https://github.com/mikefarah/yq?tab=readme-ov-file#install) for your platform if you do not already have `yq` installed. After installing `yq`, you can generate and apply your manifest as follows: - - -```bash -export DYNAMO_IMAGE=my-registry/my-image:tag - -yq '.spec.services.[].extraPodSpec.mainContainer.image = env(DYNAMO_IMAGE)' $EXAMPLE_FILE > my_example_manifest.yaml - -# install the dynamo example -kubectl apply -f my_example_manifest.yaml -n $NAMESPACE - -# uninstall the dynamo example -kubectl delete -f my_example_manifest.yaml -n $NAMESPACE - -``` \ No newline at end of file diff --git a/examples/basics/multinode/README.md b/examples/basics/multinode/README.md index 9959899648..fadd8af294 100644 --- a/examples/basics/multinode/README.md +++ b/examples/basics/multinode/README.md @@ -85,7 +85,7 @@ Install Dynamo with [SGLang](https://docs.sglang.ai/) support: pip install ai-dynamo[sglang] ``` -For more information about the SGLang backend and its integration with Dynamo, see the [SGLang Backend Documentation](../../components/backends/sglang/README.md). +For more information about the SGLang backend and its integration with Dynamo, see the [SGLang Backend Documentation](../../../components/backends/sglang/README.md). ### 3. 
Network Requirements diff --git a/examples/basics/quickstart/README.md b/examples/basics/quickstart/README.md index 694243d5d6..99dc405a0f 100644 --- a/examples/basics/quickstart/README.md +++ b/examples/basics/quickstart/README.md @@ -18,7 +18,7 @@ docker compose -f deploy/metrics/docker-compose.yml up -d ## Components - [Frontend](../../../components/frontend/README) - A built-in component that launches an OpenAI compliant HTTP server, a pre-processor, and a router in a single process -- [vLLM Backend](../../../components/backends/vllm/README) - A built-in component that runs vLLM within the Dynamo runtime +- [vLLM Backend](../../../components/backends/vllm/README.md) - A built-in component that runs vLLM within the Dynamo runtime ```mermaid --- diff --git a/examples/deployments/EKS/Deploy_VLLM_example.md b/examples/deployments/EKS/Deploy_VLLM_example.md index dd4f027da8..b395781ed5 100644 --- a/examples/deployments/EKS/Deploy_VLLM_example.md +++ b/examples/deployments/EKS/Deploy_VLLM_example.md @@ -25,8 +25,8 @@ dynamo-cloud vllm-agg-router-vllmdecodeworker-787d575485-zkwdd Test the Deployment ``` -kubectl port-forward deployment/vllm-agg-router-frontend 8080:8000 -n dynamo-cloud -curl localhost:8080/v1/chat/completions \ +kubectl port-forward deployment/vllm-agg-router-frontend 8000:8000 -n dynamo-cloud +curl localhost:8000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ "model": "Qwen/Qwen3-0.6B", diff --git a/lib/bindings/python/Cargo.lock b/lib/bindings/python/Cargo.lock index 0c60e0e802..d677224444 100644 --- a/lib/bindings/python/Cargo.lock +++ b/lib/bindings/python/Cargo.lock @@ -2912,9 +2912,9 @@ dependencies = [ [[package]] name = "nixl-sys" -version = "0.4.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97f621270fd1ed8af5a8028a1945e6f7e612a38836ce82b720fe54222739df3c" +checksum = "743ed1038b386b75451f9e0bba37cb2e3eea75873635268337d6531be99c9303" dependencies = [ "bindgen", "cc", diff --git a/lib/llm/Cargo.toml b/lib/llm/Cargo.toml index 062faf46f2..b764087194 100644 --- a/lib/llm/Cargo.toml +++ b/lib/llm/Cargo.toml @@ -89,7 +89,7 @@ rayon = "1" dialoguer = { version = "0.11", default-features = false, features = ["editor", "history"] } # block_manager -nixl-sys = {version = "0.4.0", optional = true } +nixl-sys = {version = "0.4.1", optional = true } cudarc = { version = "0.16.2", features = ["cuda-12020"], optional = true } ndarray = { version = "0.16", optional = true } nix = { version = "0.26", optional = true } diff --git a/pyproject.toml b/pyproject.toml index 32c6ff5993..df0b640cfe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,18 +61,19 @@ Repository = "https://github.com/ai-dynamo/dynamo.git" [project.optional-dependencies] trtllm =[ "uvloop", - "tensorrt-llm==1.0.0rc4" + "tensorrt-llm==1.0.0rc4", + "triton==3.3.1", # locking triton as version 3.4.0 breaks tensorrt-llm 1.0.0rc4 ] vllm = [ "uvloop", - "nixl", + "nixl<=0.4.1", "vllm==0.10.0", ] sglang = [ "uvloop", - "nixl", + "nixl<=0.4.1", "sglang[all]==0.4.9.post6", ]
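As a quick sanity check of the dependency pins introduced above (triton locked to 3.3.1 for the `trtllm` extra, and `nixl` capped at 0.4.1 for the `vllm` and `sglang` extras), something like the following could be run after installing the extras. This is a minimal sketch, not part of the change itself: the extra names and version constraints come from the pyproject diff, while the exact install environment and resolved versions on any given machine are assumptions.

```bash
# Install the TensorRT-LLM extra; per the new pin, triton should resolve to 3.3.1
# (triton 3.4.0 is reported to break tensorrt-llm 1.0.0rc4).
uv pip install "ai-dynamo[trtllm]"
python -c "import importlib.metadata as m; print('triton', m.version('triton'))"

# Install the vLLM extra; per the new constraint, nixl should resolve to <= 0.4.1.
uv pip install "ai-dynamo[vllm]"
python -c "import importlib.metadata as m; print('nixl', m.version('nixl'))"
```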