diff --git a/ATTRIBUTIONS-Go.md b/ATTRIBUTIONS-Go.md index ee06c0f790..21115e8da8 100644 --- a/ATTRIBUTIONS-Go.md +++ b/ATTRIBUTIONS-Go.md @@ -10376,5 +10376,450 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +``` + +### github.com/blang/semver/v4 + +License Identifier: MIT +License Text: +``` +The MIT License + +Copyright (c) 2014 Benedikt Lang + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +``` + +### sigs.k8s.io/randfill + +License Identifier: Apache 2.0 +License Text: +``` + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2014 The gofuzz Authors + Copyright 2025 The Kubernetes Authors + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +``` + +### github.com/NVIDIA/grove/operator/api + +License Identifier: Apache 2.0 +License Text: +``` + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2024 The Grove Authors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
``` diff --git a/Cargo.lock b/Cargo.lock index b9a4093e95..d48d17aa39 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1847,7 +1847,7 @@ dependencies = [ [[package]] name = "dynamo-engine-llamacpp" -version = "0.4.0" +version = "0.4.0+post0" dependencies = [ "async-stream", "dynamo-llm", @@ -1859,7 +1859,7 @@ dependencies = [ [[package]] name = "dynamo-engine-mistralrs" -version = "0.4.0" +version = "0.4.0+post0" dependencies = [ "anyhow", "async-openai", @@ -1877,7 +1877,7 @@ dependencies = [ [[package]] name = "dynamo-llm" -version = "0.4.0" +version = "0.4.0+post0" dependencies = [ "ahash", "akin", @@ -1954,7 +1954,7 @@ dependencies = [ [[package]] name = "dynamo-run" -version = "0.4.0" +version = "0.4.0+post0" dependencies = [ "anyhow", "async-openai", @@ -1983,7 +1983,7 @@ dependencies = [ [[package]] name = "dynamo-runtime" -version = "0.4.0" +version = "0.4.0+post0" dependencies = [ "anyhow", "arc-swap", @@ -2039,7 +2039,7 @@ dependencies = [ [[package]] name = "dynamo-tokens" -version = "0.4.0" +version = "0.4.0+post0" dependencies = [ "bytemuck", "derive-getters", @@ -3808,7 +3808,7 @@ checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" [[package]] name = "libdynamo_llm" -version = "0.4.0" +version = "0.4.0+post0" dependencies = [ "anyhow", "async-once-cell", @@ -4105,7 +4105,7 @@ dependencies = [ [[package]] name = "metrics" -version = "0.4.0" +version = "0.4.0+post0" dependencies = [ "axum 0.6.20", "clap 4.5.40", @@ -4561,9 +4561,9 @@ dependencies = [ [[package]] name = "nixl-sys" -version = "0.4.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97f621270fd1ed8af5a8028a1945e6f7e612a38836ce82b720fe54222739df3c" +checksum = "743ed1038b386b75451f9e0bba37cb2e3eea75873635268337d6531be99c9303" dependencies = [ "bindgen 0.71.1", "cc", @@ -5921,7 +5921,7 @@ dependencies = [ [[package]] name = "router" -version = "0.4.0" +version = "0.4.0+post0" dependencies = [ "clap 4.5.40", "dynamo-llm", diff --git a/Cargo.toml b/Cargo.toml index a17bc6ce27..a0ff771d95 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,7 +15,7 @@ members = [ resolver = "3" [workspace.package] -version = "0.4.0" +version = "0.4.0+post0" edition = "2021" description = "Dynamo Inference Framework" authors = ["NVIDIA Inc. "] diff --git a/README.md b/README.md index 759a9187d8..4369dc8142 100644 --- a/README.md +++ b/README.md @@ -21,12 +21,30 @@ limitations under the License. [![Discord](https://dcbadge.limes.pink/api/server/D92uqZRjCZ?style=flat)](https://discord.gg/D92uqZRjCZ) [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/ai-dynamo/dynamo) -| **[Roadmap](https://github.com/ai-dynamo/dynamo/issues/762)** | **[Documentation](https://docs.nvidia.com/dynamo/latest/index.html)** | **[Examples](https://github.com/ai-dynamo/dynamo/tree/main/examples)** | **[Design Proposals](https://github.com/ai-dynamo/enhancements)** | +| **[Roadmap](https://github.com/ai-dynamo/dynamo/issues/762)** | **[Documentation](https://docs.nvidia.com/dynamo/latest/index.html)** | **[Support Matrix](docs/support_matrix.md)** | **[Examples](https://github.com/ai-dynamo/dynamo/tree/main/examples)** | **[Design Proposals](https://github.com/ai-dynamo/enhancements)** | # NVIDIA Dynamo High-throughput, low-latency inference framework designed for serving generative AI and reasoning models in multi-node distributed environments. 
+## Framework Support Matrix + +| Feature | vLLM | SGLang | TensorRT-LLM | +|---------|----------------------|----------------------------|----------------------------------------| +| [**Disaggregated Serving**](/docs/architecture/disagg_serving.md) | βœ… | βœ… | βœ… | +| [**Conditional Disaggregation**](/docs/architecture/disagg_serving.md#conditional-disaggregation) | 🚧 | 🚧 | 🚧 | +| [**KV-Aware Routing**](/docs/architecture/kv_cache_routing.md) | βœ… | βœ… | βœ… | +| [**SLA-Based Planner**](/docs/architecture/sla_planner.md) | βœ… | 🚧 | 🚧 | +| [**Load Based Planner**](/docs/architecture/load_planner.md) | βœ… | 🚧 | 🚧 | +| [**KVBM**](/docs/architecture/kvbm_architecture.md) | 🚧 | 🚧 | 🚧 | + +To learn more about each framework and their capabilities, check out each framework's README and deploy them with Dynamo! +- **[vLLM](components/backends/vllm/README.md)** +- **[SGLang](components/backends/sglang/README.md)** +- **[TensorRT-LLM](components/backends/trtllm/README.md)** + +Built in Rust for performance and in Python for extensibility, Dynamo is fully open-source and driven by a transparent, OSS (Open Source Software) first development approach. + ## The Era of Multi-GPU, Multi-Node
@@ -47,24 +65,6 @@ Dynamo is designed to be inference engine agnostic (supports TRT-LLM, vLLM, SGLa Dynamo architecture
-## Framework Support Matrix - -| Feature | vLLM | SGLang | TensorRT-LLM | -|---------|----------------------|----------------------------|----------------------------------------| -| [**Disaggregated Serving**](/docs/architecture/disagg_serving.md) | βœ… | βœ… | βœ… | -| [**Conditional Disaggregation**](/docs/architecture/disagg_serving.md#conditional-disaggregation) | 🚧 | 🚧 | 🚧 | -| [**KV-Aware Routing**](/docs/architecture/kv_cache_routing.md) | βœ… | βœ… | βœ… | -| [**SLA-Based Planner**](/docs/architecture/sla_planner.md) | βœ… | 🚧 | 🚧 | -| [**Load Based Planner**](/docs/architecture/load_planner.md) | βœ… | 🚧 | 🚧 | -| [**KVBM**](/docs/architecture/kvbm_architecture.md) | 🚧 | 🚧 | 🚧 | - -To learn more about each framework and their capabilities, check out each framework's README! -- **[vLLM](components/backends/vllm/README.md)** -- **[SGLang](components/backends/sglang/README.md)** -- **[TensorRT-LLM](components/backends/trtllm/README.md)** - -Built in Rust for performance and in Python for extensibility, Dynamo is fully open-source and driven by a transparent, OSS (Open Source Software) first development approach. - # Installation The following examples require a few system level packages. @@ -115,11 +115,11 @@ Dynamo provides a simple way to spin up a local set of inference components incl ``` # Start an OpenAI compatible HTTP server, a pre-processor (prompt templating and tokenization) and a router: -python -m dynamo.frontend [--http-port 8080] +python -m dynamo.frontend --http-port 8080 # Start the SGLang engine, connecting to NATS and etcd to receive requests. You can run several of these, # both for the same model and for multiple models. The frontend node will discover them. -python -m dynamo.sglang.worker deepseek-ai/DeepSeek-R1-Distill-Llama-8B +python -m dynamo.sglang.worker --model deepseek-ai/DeepSeek-R1-Distill-Llama-8B --skip-tokenizer-init ``` #### Send a Request @@ -167,10 +167,15 @@ To specify which GPUs to use set environment variable `CUDA_VISIBLE_DEVICES`. ## SGLang + ``` -# Install libnuma +# Install libnuma-dev apt install -y libnuma-dev +# Install flashinfer-python pre-release (required by sglang for optimized inference) +uv pip install "flashinfer-python==0.2.9rc2" --prerelease=allow + +# Install ai-dynamo with sglang support uv pip install ai-dynamo[sglang] ``` diff --git a/benchmarks/llm/README.md b/benchmarks/llm/README.md index e0cb8e976d..614dbd9be4 100644 --- a/benchmarks/llm/README.md +++ b/benchmarks/llm/README.md @@ -12,4 +12,3 @@ See the License for the specific language governing permissions and limitations under the License. --> -[../../examples/llm/benchmarks/README.md](../../examples/llm/benchmarks/README.md) diff --git a/components/README.md b/components/README.md index 2c5677eae7..3f638f5371 100644 --- a/components/README.md +++ b/components/README.md @@ -77,4 +77,4 @@ To get started with Dynamo components: 4. **Run deployment scripts** from the engine's launch directory 5. **Monitor performance** using the metrics component -For detailed instructions, see the README files in each component directory and the main [Dynamo documentation](../../docs/). +For detailed instructions, see the README files in each component directory and the main [Dynamo documentation](../docs/). 
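Putting the quickstart pieces from the README together, a minimal local smoke test could look like the sketch below. It assumes NATS and etcd are already running (for example via `docker compose -f deploy/docker-compose.yml up -d`), that the frontend is listening on the `--http-port 8080` used in the quickstart, and that the worker has finished loading the model before the request is sent; the request body fields are illustrative.

```bash
# Start the OpenAI-compatible frontend (HTTP server, pre-processor, router) on port 8080.
python -m dynamo.frontend --http-port 8080 &

# Start an SGLang worker; it registers itself via etcd/NATS and is discovered by the frontend.
python -m dynamo.sglang.worker --model deepseek-ai/DeepSeek-R1-Distill-Llama-8B --skip-tokenizer-init &

# Once the model has loaded, exercise the OpenAI-compatible endpoint.
curl localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
        "messages": [{"role": "user", "content": "Hello, who are you?"}],
        "max_tokens": 64,
        "stream": false
      }'
```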
diff --git a/components/backends/llama_cpp/README.md b/components/backends/llama_cpp/README.md index f7c9e6520e..78a553c0c1 100644 --- a/components/backends/llama_cpp/README.md +++ b/components/backends/llama_cpp/README.md @@ -13,7 +13,7 @@ python -m dynamo.llama_cpp --model-path /data/models/Qwen3-0.6B-Q8_0.gguf [args] ## Request Migration -In a [Distributed System](#distributed-system), a request may fail due to connectivity issues between the Frontend and the Backend. +In a Distributed System, a request may fail due to connectivity issues between the Frontend and the Backend. The Frontend will automatically track which Backends are having connectivity issues with it and avoid routing new requests to the Backends with known connectivity issues. diff --git a/components/backends/sglang/README.md b/components/backends/sglang/README.md index ffb58e76a0..9a9dec088c 100644 --- a/components/backends/sglang/README.md +++ b/components/backends/sglang/README.md @@ -34,26 +34,25 @@ git checkout $(git describe --tags $(git rev-list --tags --max-count=1)) | Feature | SGLang | Notes | |---------|--------|-------| -| [**Disaggregated Serving**](../../docs/architecture/disagg_serving.md) | βœ… | | -| [**Conditional Disaggregation**](../../docs/architecture/disagg_serving.md#conditional-disaggregation) | 🚧 | WIP [PR](https://github.com/sgl-project/sglang/pull/7730) | -| [**KV-Aware Routing**](../../docs/architecture/kv_cache_routing.md) | βœ… | | -| [**SLA-Based Planner**](../../docs/architecture/sla_planner.md) | ❌ | Planned | -| [**Load Based Planner**](../../docs/architecture/load_planner.md) | ❌ | Planned | -| [**KVBM**](../../docs/architecture/kvbm_architecture.md) | ❌ | Planned | +| [**Disaggregated Serving**](../../../docs/architecture/disagg_serving.md) | βœ… | | +| [**Conditional Disaggregation**](../../../docs/architecture/disagg_serving.md#conditional-disaggregation) | 🚧 | WIP [PR](https://github.com/sgl-project/sglang/pull/7730) | +| [**KV-Aware Routing**](../../../docs/architecture/kv_cache_routing.md) | βœ… | | +| [**SLA-Based Planner**](../../../docs/architecture/sla_planner.md) | ❌ | Planned | +| [**Load Based Planner**](../../../docs/architecture/load_planner.md) | ❌ | Planned | +| [**KVBM**](../../../docs/architecture/kvbm_architecture.md) | ❌ | Planned | ### Large Scale P/D and WideEP Features -| Feature | SGLang | Notes | -|--------------------|--------|-----------------------------------------------------------------------| -| **WideEP** | βœ…/🚧 | Full support on H100s/GB200 WIP [PR](https://github.com/sgl-project/sglang/pull/7556) | -| **DP Rank Routing**| 🚧 | Direct routing supported. Process per DP rank is not supported | -| **GB200 Support** | 🚧 | WIP [PR](https://github.com/sgl-project/sglang/pull/7556) | +| Feature | SGLang | Notes | +|---------------------|--------|--------------------------------------------------------------| +| **WideEP** | βœ… | Full support on H100s/GB200 | +| **DP Rank Routing** | 🚧 | Direct routing supported. Dynamo KV router does not route to DP workers | +| **GB200 Support** | βœ… | | ## Quick Start -Below we provide a guide that lets you run all of our the common deployment patterns on a single node. See our different [architectures](../llm/README.md#deployment-architectures) for a high level overview of each pattern and the architecture diagram for each. - +Below we provide a guide that lets you run all of the common deployment patterns on a single node.
### Start NATS and ETCD in the background Start using [Docker Compose](../../../deploy/docker-compose.yml) @@ -141,7 +140,7 @@ cd $DYNAMO_ROOT/components/backends/sglang ## Request Migration -In a [Distributed System](#distributed-system), a request may fail due to connectivity issues between the Frontend and the Backend. +In a Distributed System, a request may fail due to connectivity issues between the Frontend and the Backend. The Frontend will automatically track which Backends are having connectivity issues with it and avoid routing new requests to the Backends with known connectivity issues. @@ -164,7 +163,6 @@ Below we provide a selected list of advanced examples. Please open up an issue i ### Large scale P/D disaggregation with WideEP - **[Run DeepSeek-R1 on 104+ H100s](docs/dsr1-wideep-h100.md)** -- **[Run DeepSeek-R1 on GB200s](docs/dsr1-wideep-gb200.md)** ### Speculative Decoding - **[Deploying DeepSeek-R1 with MTP - coming soon!](.)** diff --git a/components/backends/sglang/deploy/README.md b/components/backends/sglang/deploy/README.md new file mode 100644 index 0000000000..c631951334 --- /dev/null +++ b/components/backends/sglang/deploy/README.md @@ -0,0 +1,162 @@ +# SGLang Kubernetes Deployment Configurations + +This directory contains Kubernetes Custom Resource Definition (CRD) templates for deploying SGLang inference graphs using the **DynamoGraphDeployment** resource. + +## Available Deployment Patterns + +### 1. **Aggregated Deployment** (`agg.yaml`) +Basic deployment pattern with frontend and a single decode worker. + +**Architecture:** +- `Frontend`: OpenAI-compatible API server +- `SGLangDecodeWorker`: Single worker handling both prefill and decode + +### 2. **Aggregated Router Deployment** (`agg_router.yaml`) +Enhanced aggregated deployment with KV cache routing capabilities. + +**Architecture:** +- `Frontend`: OpenAI-compatible API server with router mode enabled (`--router-mode kv`) +- `SGLangDecodeWorker`: Single worker handling both prefill and decode + +### 3. **Disaggregated Deployment** (`disagg.yaml`)** +High-performance deployment with separated prefill and decode workers. + +**Architecture:** +- `Frontend`: HTTP API server coordinating between workers +- `SGLangDecodeWorker`: Specialized decode-only worker (`--disaggregation-mode decode`) +- `SGLangPrefillWorker`: Specialized prefill-only worker (`--disaggregation-mode prefill`) +- Communication via NIXL transfer backend (`--disaggregation-transfer-backend nixl`) + +## CRD Structure + +All templates use the **DynamoGraphDeployment** CRD: + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: +spec: + services: + : + # Service configuration +``` + +### Key Configuration Options + +**Resource Management:** +```yaml +resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" +``` + +**Container Configuration:** +```yaml +extraPodSpec: + mainContainer: + image: my-registry/sglang-runtime:my-tag + workingDir: /workspace/components/backends/sglang + args: + - "python3" + - "-m" + - "dynamo.sglang.worker" + # Model-specific arguments +``` + +## Prerequisites + +Before using these templates, ensure you have: + +1. **Dynamo Cloud Platform installed** - See [Installing Dynamo Cloud](../../../../docs/guides/dynamo_deploy/dynamo_cloud.md) +2. **Kubernetes cluster with GPU support** +3. **Container registry access** for SGLang runtime images +4. 
**HuggingFace token secret** (referenced as `envFromSecret: hf-token-secret`) + +## Usage + +### 1. Choose Your Template +Select the deployment pattern that matches your requirements: +- Use `agg.yaml` for development/testing +- Use `agg_router.yaml` for production with load balancing +- Use `disagg.yaml` for maximum performance + +### 2. Customize Configuration +Edit the template to match your environment: + +```yaml +# Update image registry and tag +image: your-registry/sglang-runtime:your-tag + +# Configure your model +args: + - "--model-path" + - "your-org/your-model" + - "--served-model-name" + - "your-org/your-model" +``` + +### 3. Deploy + +Use the following command to deploy the deployment file. + +First, create a secret for the HuggingFace token. +```bash +export HF_TOKEN=your_hf_token +kubectl create secret generic hf-token-secret \ + --from-literal=HF_TOKEN=${HF_TOKEN} \ + -n ${NAMESPACE} +``` + +Then, deploy the model using the deployment file. + +```bash +export DEPLOYMENT_FILE=agg.yaml +kubectl apply -f $DEPLOYMENT_FILE -n ${NAMESPACE} +``` + +### 4. Using Custom Dynamo Frameworks Image for SGLang + +To use a custom dynamo frameworks image for SGLang, you can update the deployment file using yq: + +```bash +export DEPLOYMENT_FILE=agg.yaml +export FRAMEWORK_RUNTIME_IMAGE= + +yq '.spec.services.[].extraPodSpec.mainContainer.image = env(FRAMEWORK_RUNTIME_IMAGE)' $DEPLOYMENT_FILE > $DEPLOYMENT_FILE.generated +kubectl apply -f $DEPLOYMENT_FILE.generated -n $NAMESPACE +``` + +## Model Configuration + +All templates use **DeepSeek-R1-Distill-Llama-8B** as the default model. But you can use any sglang argument and configuration. Key parameters: + +## Monitoring and Health + +- **Frontend health endpoint**: `http://:8000/health` +- **Liveness probes**: Check process health every 60s + +## Further Reading + +- **Deployment Guide**: [Creating Kubernetes Deployments](../../../../docs/guides/dynamo_deploy/create_deployment.md) +- **Quickstart**: [Deployment Quickstart](../../../../docs/guides/dynamo_deploy/quickstart.md) +- **Platform Setup**: [Dynamo Cloud Installation](../../../../docs/guides/dynamo_deploy/dynamo_cloud.md) +- **Examples**: [Deployment Examples](../../../../docs/examples/README.md) +- **Kubernetes CRDs**: [Custom Resources Documentation](https://kubernetes.io/docs/concepts/extend-kubernetes/api-extension/custom-resources/) + +## Troubleshooting + +Common issues and solutions: + +1. **Pod fails to start**: Check image registry access and HuggingFace token secret +2. **GPU not allocated**: Verify cluster has GPU nodes and proper resource limits +3. **Health check failures**: Review model loading logs and increase `initialDelaySeconds` +4. **Out of memory**: Increase memory limits or reduce model batch size + +For additional support, refer to the [deployment troubleshooting guide](../../../../docs/guides/dynamo_deploy/quickstart.md#troubleshooting). 
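As a quick sanity check after `kubectl apply`, you can confirm that the custom resource and its pods came up and that the frontend reports healthy. The sketch below assumes your namespace is in `${NAMESPACE}`; the lowercase resource name accepted by `kubectl` and the exact frontend pod name depend on how the CRD and operator are installed in your cluster, so treat them as placeholders (check `kubectl api-resources | grep -i dynamo` and `kubectl get pods` for the real names).

```bash
# List the DynamoGraphDeployment resource created from the template
# (the plural resource name here is an assumption; verify it with `kubectl api-resources`).
kubectl get dynamographdeployments -n ${NAMESPACE}

# Watch the frontend and worker pods start up.
kubectl get pods -n ${NAMESPACE} -w

# Port-forward to the frontend pod (substitute the real pod name from the listing above)
# and check the health endpoint described under "Monitoring and Health".
kubectl port-forward pod/<frontend-pod-name> 8000:8000 -n ${NAMESPACE} &
curl http://localhost:8000/health
```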
diff --git a/components/backends/sglang/deploy/disagg.yaml b/components/backends/sglang/deploy/disagg.yaml index 06c4b842d2..aa90223486 100644 --- a/components/backends/sglang/deploy/disagg.yaml +++ b/components/backends/sglang/deploy/disagg.yaml @@ -83,7 +83,7 @@ spec: args: - "python3" - "-m" - - "dynamo.sglang.worker" + - "dynamo.sglang.decode_worker" - "--model-path" - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" - "--served-model-name" @@ -152,4 +152,4 @@ spec: - "--disaggregation-mode" - "prefill" - "--disaggregation-transfer-backend" - - "nixl" \ No newline at end of file + - "nixl" diff --git a/components/backends/sglang/docs/dsr1-wideep-h100.md b/components/backends/sglang/docs/dsr1-wideep-h100.md index d766bc3edf..6cfcace10d 100644 --- a/components/backends/sglang/docs/dsr1-wideep-h100.md +++ b/components/backends/sglang/docs/dsr1-wideep-h100.md @@ -5,26 +5,18 @@ SPDX-License-Identifier: Apache-2.0 # Running DeepSeek-R1 Disaggregated with WideEP on H100s -Dynamo supports SGLang's implementation of wide expert parallelism and large scale P/D for DeepSeek-R1! You can read their blog post [here](https://www.nvidia.com/en-us/technologies/ai/deepseek-r1-large-scale-p-d-with-wide-expert-parallelism/) for more details. We provide a Dockerfile for this in `container/Dockerfile.sglang-deepep` and configurations to deploy this at scale. In this example, we will run 1 prefill worker on 4 H100 nodes and 1 decode worker on 9 H100 nodes (104 total GPUs). +Dynamo supports SGLang's implementation of wide expert parallelism and large scale P/D for DeepSeek-R1! You can read their blog post [here](https://lmsys.org/blog/2025-05-05-large-scale-ep/) for more details. We provide a Dockerfile for this in `container/Dockerfile.sglang-deepep` and configurations to deploy this at scale. In this example, we will run 1 prefill worker on 4 H100 nodes and 1 decode worker on 9 H100 nodes (104 total GPUs). ## Instructions -1. Pull the SGLang container. - -```bash -docker pull lmsysorg/sglang:latest -``` - -You can also pull a specific tag from the [lmsys dockerhub](https://hub.docker.com/r/lmsysorg/sglang/tags) - -2. Build the Dynamo container +1. Build the Dynamo container ```bash cd $DYNAMO_ROOT docker build -f container/Dockerfile.sglang-wideep . -t dynamo-wideep --no-cache ``` -3. You can run this container on each 8xH100 node using the following command. +2. You can run this container on each 8xH100 node using the following command. > [!IMPORTANT] > We recommend downloading DeepSeek-R1 and then mounting it to the container. You can find the model [here](https://huggingface.co/deepseek-ai/DeepSeek-R1) @@ -47,19 +39,19 @@ docker run \ In each container, you should be in the `/sgl-workspace/dynamo/components/backends/sglang` directory. -4. On the head prefill node, run the helper script provided to generate commands to start the `nats-server`, `etcd`. This script will also tell you which environment variables to export on each node to make deployment easier. +3. On the head prefill node, run the helper script provided to generate commands to start the `nats-server`, `etcd`. This script will also tell you which environment variables to export on each node to make deployment easier. ```bash -./utils/gen_env_vars.sh +./components/backends/sglang/src/dynamo/sglang/utils/gen_env_vars.sh ``` -5. Run the ingress and prefill worker +4. 
Run the ingress and prefill worker ```bash # run ingress -dynamo run in=http out=dyn & +python3 -m dynamo.frontend --http-port=8000 & # optionally run the http server that allows you to flush the kv cache for all workers (see benchmarking section below) -python3 utils/sgl_http_server.py --ns dynamo & +python3 -m dynamo.sglang.utils.sgl_http_server --ns dynamo & # run prefill worker python3 -m dynamo.sglang.worker \ --model-path /model/ \ @@ -93,7 +85,7 @@ python3 -m dynamo.sglang.worker \ On the other prefill node (since this example has 4 total prefill nodes), run the same command but change `--node-rank` to 1,2, and 3 -7. Run the decode worker on the head decode node +5. Run the decode worker on the head decode node ```bash python3 -m dynamo.sglang.decode_worker \ @@ -121,7 +113,7 @@ python3 -m dynamo.sglang.decode_worker \ --deepep-mode low_latency \ --mem-fraction-static 0.835 \ --ep-num-redundant-experts 32 \ - --cuda-graph-bs 256 + --cuda-graph-bs 128 ``` On the other decode nodes (this example has 9 total decode nodes), run the same command but change `--node-rank` to 1, 2, 3, 4, 5, 6, 7, and 8 @@ -131,6 +123,7 @@ On the other decode nodes (this example has 9 total decode nodes), run the same In the official [blog post repro instructions](https://github.com/sgl-project/sglang/issues/6017), SGL uses batch inference to benchmark their prefill and decode workers. They do this by pretokenizing the ShareGPT dataset and then creating a batch of 8192 requests with ISL 4096 and OSL 5 (for prefill stress test) and a batch of 40000 with ISL 2000 and OSL 100 (for decode stress test). If you want to repro these benchmarks, you will need to add the following flags to the prefill and decode commands: prefill: + ```bash ... --max-running-requests 8192 \ @@ -142,6 +135,7 @@ prefill: ``` decode: + ```bash ... --max-running-requests 18432 \ @@ -152,9 +146,10 @@ decode: We currently provide 2 different ways to perform an end to end benchmark which includes using our OpenAI frontend and tokenization. We will continue to add better support for these sorts of large single batch workloads in the future. 1. **GenAI Perf to benchmark end to end performance with 8k ISL 256 OSL** -We've found that 8k ISL 256 OSL provides a good baseline for measuring end to end disaggregated serving performance for DSR1. As WideEP allows for a higher throughput, we provide a script that runs this workload at high concurrencies. DeepGEMM kernels can sometimes take a while to warm up. We provide a short ramping warmup script that can be used. + We've found that 8k ISL 256 OSL provides a good baseline for measuring end to end disaggregated serving performance for DSR1. As WideEP allows for a higher throughput, we provide a script that runs this workload at high concurrencies. DeepGEMM kernels can sometimes take a while to warm up. We provide a short ramping warmup script that can be used. Example usage: + ```bash # warmup ./utils/bench.sh HEAD_PREFILL_NODE_IP --type warmup @@ -165,9 +160,10 @@ curl -X POST http://${HEAD_PREFILL_NODE_IP}:9001/flush_cache ``` 2. **GenAI Perf to benchmark completions with custom dataset** -We provide a script that generates a JSONL file of the ShareGPT dataset and then use GenAI Perf to benchmark the prefill and decode workers. We use ShareGPT in order to leverage the pre-existing EPLB distributions provided by the SGLang team. 
If you don't want to use ShareGPT - you can also use GenAIPerf's synthetic dataset setup But note you will have to use dynamic EPLB configurations or record your own as the `init-expert-location` provided by SGLang is tuned specifically for the ShareGPT dataset at a 4096 ISL and 5 OSL. + We provide a script that generates a JSONL file of the ShareGPT dataset and then use GenAI Perf to benchmark the prefill and decode workers. We use ShareGPT in order to leverage the pre-existing EPLB distributions provided by the SGLang team. If you don't want to use ShareGPT - you can also use GenAIPerf's synthetic dataset setup But note you will have to use dynamic EPLB configurations or record your own as the `init-expert-location` provided by SGLang is tuned specifically for the ShareGPT dataset at a 4096 ISL and 5 OSL. Example usage: + ```bash # generate data python3 src/dynamo/sglang/utils/generate_bench_data.py --output data.jsonl --num-prompts 8192 --input-len 4096 --output-len 5 --model deepseek-ai/DeepSeek-R1 diff --git a/components/backends/sglang/docs/multinode-examples.md b/components/backends/sglang/docs/multinode-examples.md index 2bc0a802ff..d6ae5e32e0 100644 --- a/components/backends/sglang/docs/multinode-examples.md +++ b/components/backends/sglang/docs/multinode-examples.md @@ -19,7 +19,7 @@ SGLang allows you to deploy multi-node sized models by adding in the `dist-init- Node 1: Run HTTP ingress, processor, and 8 shards of the prefill worker ```bash # run ingress -dynamo run in=http out=dyn & +python3 -m dynamo.frontend --http-port=8000 & # run prefill worker python3 -m dynamo.sglang.worker \ --model-path /model/ \ @@ -102,7 +102,7 @@ SGLang typically requires a warmup period to ensure the DeepGEMM kernels are loa curl ${HEAD_PREFILL_NODE_IP}:8000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", + "model": "deepseek-ai/DeepSeek-R1", "messages": [ { "role": "user", diff --git a/components/backends/sglang/docs/sgl-http-server.md b/components/backends/sglang/docs/sgl-http-server.md index 0d87b760c3..28e2b2400a 100644 --- a/components/backends/sglang/docs/sgl-http-server.md +++ b/components/backends/sglang/docs/sgl-http-server.md @@ -74,7 +74,7 @@ The server accepts the following command-line arguments: Start the server: ```bash -python src/dynamo/sglang/utils/sgl_http_server.py --port 9001 --namespace dynamo +python3 -m dynamo.sglang.utils.sgl_http_server --ns dynamo ``` The server will automatically discover all SGLang components in the specified namespace and provide HTTP endpoints for managing them. diff --git a/components/backends/sglang/launch/agg_router.sh b/components/backends/sglang/launch/agg_router.sh index b45509235c..46a0eff19d 100755 --- a/components/backends/sglang/launch/agg_router.sh +++ b/components/backends/sglang/launch/agg_router.sh @@ -15,7 +15,8 @@ trap cleanup EXIT INT TERM python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo # run ingress -dynamo run in=http out=dyn --router-mode kv --http-port=8000 & +# run ingress +python -m dynamo.frontend --router-mode kv --http-port=8000 & DYNAMO_PID=$! 
# run worker diff --git a/components/backends/sglang/slurm_jobs/README.md b/components/backends/sglang/slurm_jobs/README.md index 19f7c27ada..da930f7b15 100644 --- a/components/backends/sglang/slurm_jobs/README.md +++ b/components/backends/sglang/slurm_jobs/README.md @@ -1,108 +1 @@ -# Example: Deploy Multi-node SGLang with Dynamo on SLURM - -This folder implements the example of [SGLang DeepSeek-R1 Disaggregated with WideEP](../dsr1-wideep.md) on a SLURM cluster. - -## Overview - -The scripts in this folder set up multiple cluster nodes to run the [SGLang DeepSeek-R1 Disaggregated with WideEP](../dsr1-wideep.md) example, with separate nodes handling prefill and decode. -The node setup is done using Python job submission scripts with Jinja2 templates for flexible configuration. The setup also includes GPU utilization monitoring capabilities to track performance during benchmarks. - -## Scripts - -- **`submit_job_script.py`**: Main script for generating and submitting SLURM job scripts from templates -- **`job_script_template.j2`**: Jinja2 template for generating SLURM job scripts -- **`scripts/worker_setup.py`**: Worker script that handles the setup on each node -- **`scripts/monitor_gpu_utilization.sh`**: Script for monitoring GPU utilization during benchmarks - -## Logs Folder Structure - -Each SLURM job creates a unique log directory under `logs/` using the job ID. For example, job ID `3062824` creates the directory `logs/3062824/`. - -### Log File Structure - -``` -logs/ -β”œβ”€β”€ 3062824/ # Job ID directory -β”‚ β”œβ”€β”€ log.out # Main job output (node allocation, IP addresses, launch commands) -β”‚ β”œβ”€β”€ log.err # Main job errors -β”‚ β”œβ”€β”€ node0197_prefill.out # Prefill node stdout (node0197) -β”‚ β”œβ”€β”€ node0197_prefill.err # Prefill node stderr (node0197) -β”‚ β”œβ”€β”€ node0200_prefill.out # Prefill node stdout (node0200) -β”‚ β”œβ”€β”€ node0200_prefill.err # Prefill node stderr (node0200) -β”‚ β”œβ”€β”€ node0201_decode.out # Decode node stdout (node0201) -β”‚ β”œβ”€β”€ node0201_decode.err # Decode node stderr (node0201) -β”‚ β”œβ”€β”€ node0204_decode.out # Decode node stdout (node0204) -β”‚ β”œβ”€β”€ node0204_decode.err # Decode node stderr (node0204) -β”‚ β”œβ”€β”€ node0197_prefill_gpu_utilization.log # GPU utilization monitoring (node0197) -β”‚ β”œβ”€β”€ node0200_prefill_gpu_utilization.log # GPU utilization monitoring (node0200) -β”‚ β”œβ”€β”€ node0201_decode_gpu_utilization.log # GPU utilization monitoring (node0201) -β”‚ └── node0204_decode_gpu_utilization.log # GPU utilization monitoring (node0204) -β”œβ”€β”€ 3063137/ # Another job ID directory -β”œβ”€β”€ 3062689/ # Another job ID directory -└── ... -``` - -## Setup - -For simplicity of the example, we will make some assumptions about your SLURM cluster: -1. We assume you have access to a SLURM cluster with multiple GPU nodes - available. For functional testing, most setups should be fine. For performance - testing, you should aim to allocate groups of nodes that are performantly - inter-connected, such as those in an NVL72 setup. -2. We assume this SLURM cluster has the [Pyxis](https://github.com/NVIDIA/pyxis) - SPANK plugin setup. In particular, the `job_script_template.j2` template in this - example will use `srun` arguments like `--container-image`, - `--container-mounts`, and `--container-env` that are added to `srun` by Pyxis. - If your cluster supports similar container based plugins, you may be able to - modify the template to use that instead. -3. 
We assume you have already built a recent Dynamo+SGLang container image as - described [here](../dsr1-wideep.md#instructions). - This is the image that can be passed to the `--container-image` argument in later steps. - -## Usage - -1. **Submit a benchmark job**: - ```bash - python submit_job_script.py \ - --template job_script_template.j2 \ - --model-dir /path/to/model \ - --config-dir /path/to/configs \ - --container-image container-image-uri \ - --account your-slurm-account - ``` - - **Required arguments**: - - `--template`: Path to Jinja2 template file - - `--model-dir`: Model directory path - - `--config-dir`: Config directory path - - `--container-image`: Container image URI (e.g., `registry/repository:tag`) - - `--account`: SLURM account - - **Optional arguments**: - - `--prefill-nodes`: Number of prefill nodes (default: `2`) - - `--decode-nodes`: Number of decode nodes (default: `2`) - - `--gpus-per-node`: Number of GPUs per node (default: `8`) - - `--network-interface`: Network interface to use (default: `eth3`) - - `--job-name`: SLURM job name (default: `dynamo_setup`) - - `--time-limit`: Time limit in HH:MM:SS format (default: `01:00:00`) - - **Note**: The script automatically calculates the total number of nodes needed based on `--prefill-nodes` and `--decode-nodes` parameters. - -2. **Monitor job progress**: - ```bash - squeue -u $USER - ``` - -3. **Check logs in real-time**: - ```bash - tail -f logs/{JOB_ID}/log.out - ``` - -4. **Monitor GPU utilization**: - ```bash - tail -f logs/{JOB_ID}/{node}_prefill_gpu_utilization.log - ``` - -## Outputs - -Benchmark results and outputs are stored in the `outputs/` directory, which is mounted into the container. +Please refer to [Deploying Dynamo with SGLang on SLURM](../../../../docs/components/backends/sglang/slurm_jobs/README.md) for more details. 
\ No newline at end of file diff --git a/components/backends/sglang/slurm_jobs/job_script_template.j2 b/components/backends/sglang/slurm_jobs/job_script_template.j2 index 84e0e33396..bbabbe8152 100755 --- a/components/backends/sglang/slurm_jobs/job_script_template.j2 +++ b/components/backends/sglang/slurm_jobs/job_script_template.j2 @@ -54,8 +54,7 @@ echo "Decode host IP address: $DECODE_HOST_IP" ENROOT_ARGS="\ --container-image=${CONTAINER_IMAGE} \ --no-container-entrypoint \ - --container-mount-home \ - --no-container-remap-root \ + --no-container-mount-home \ --container-mounts=${MODEL_DIR}:/model/,${CONFIG_DIR}:/configs/,${SCRIPT_DIR}:/scripts/,${OUTPUT_DIR}:/outputs/,${LOG_DIR}:/logs/ \ " diff --git a/components/backends/sglang/slurm_jobs/scripts/worker_setup.py b/components/backends/sglang/slurm_jobs/scripts/worker_setup.py index db6ac88531..08df3bfb67 100644 --- a/components/backends/sglang/slurm_jobs/scripts/worker_setup.py +++ b/components/backends/sglang/slurm_jobs/scripts/worker_setup.py @@ -206,7 +206,9 @@ def setup_prefill_node( if not etcd_process: raise RuntimeError("Failed to start etcd") - ingress_process = run_command("dynamo run in=http out=dyn", background=True) + ingress_process = run_command( + "python3 -m dynamo.frontend --http-port=8000 &", background=True + ) if not ingress_process: raise RuntimeError("Failed to start ingress") @@ -291,7 +293,7 @@ def setup_decode_node( "--deepep-mode low_latency " "--mem-fraction-static 0.835 " "--ep-num-redundant-experts 32 " - "--cuda-graph-bs 256 " + "--cuda-graph-bs 128 " ) return run_command(dynamo_cmd) diff --git a/components/backends/trtllm/README.md b/components/backends/trtllm/README.md deleted file mode 100644 index 3a5b495dce..0000000000 --- a/components/backends/trtllm/README.md +++ /dev/null @@ -1,230 +0,0 @@ - - -# LLM Deployment using TensorRT-LLM - -This directory contains examples and reference implementations for deploying Large Language Models (LLMs) in various configurations using TensorRT-LLM. 
- -## Use the Latest Release - -We recommend using the latest stable release of dynamo to avoid breaking changes: - -[![GitHub Release](https://img.shields.io/github/v/release/ai-dynamo/dynamo)](https://github.com/ai-dynamo/dynamo/releases/latest) - -You can find the latest release [here](https://github.com/ai-dynamo/dynamo/releases/latest) and check out the corresponding branch with: - -```bash -git checkout $(git describe --tags $(git rev-list --tags --max-count=1)) -``` - ---- - -## Table of Contents -- [Feature Support Matrix](#feature-support-matrix) -- [Quick Start](#quick-start) -- [Single Node Examples](#single-node-examples) -- [Advanced Examples](#advanced-examples) -- [Disaggregation Strategy](#disaggregation-strategy) -- [KV Cache Transfer](#kv-cache-transfer-in-disaggregated-serving) -- [Client](#client) -- [Benchmarking](#benchmarking) - -## Feature Support Matrix - -### Core Dynamo Features - -| Feature | TensorRT-LLM | Notes | -|---------|--------------|-------| -| [**Disaggregated Serving**](../../docs/architecture/disagg_serving.md) | βœ… | | -| [**Conditional Disaggregation**](../../docs/architecture/disagg_serving.md#conditional-disaggregation) | 🚧 | Not supported yet | -| [**KV-Aware Routing**](../../docs/architecture/kv_cache_routing.md) | βœ… | | -| [**SLA-Based Planner**](../../docs/architecture/sla_planner.md) | 🚧 | Planned | -| [**Load Based Planner**](../../docs/architecture/load_planner.md) | 🚧 | Planned | -| [**KVBM**](../../docs/architecture/kvbm_architecture.md) | 🚧 | Planned | - -### Large Scale P/D and WideEP Features - -| Feature | TensorRT-LLM | Notes | -|--------------------|--------------|-----------------------------------------------------------------------| -| **WideEP** | βœ… | | -| **DP Rank Routing**| βœ… | | -| **GB200 Support** | βœ… | | - -## Quick Start - -Below we provide a guide that lets you run all of the common deployment patterns on a single node. - -### Start NATS and ETCD in the background - -Start using [Docker Compose](../../../deploy/docker-compose.yml) - -```bash -docker compose -f deploy/docker-compose.yml up -d -``` - -### Build container - -```bash -# TensorRT-LLM uses git-lfs, which needs to be installed in advance. -apt-get update && apt-get -y install git git-lfs - -# On an x86 machine: -./container/build.sh --framework tensorrtllm - -# On an ARM machine: -./container/build.sh --framework tensorrtllm --platform linux/arm64 - -# Build the container with the default experimental TensorRT-LLM commit -# WARNING: This is for experimental feature testing only. -# The container should not be used in a production environment. -./container/build.sh --framework tensorrtllm --use-default-experimental-tensorrtllm-commit -``` - -### Run container - -```bash -./container/run.sh --framework tensorrtllm -it -``` - -## Single Node Examples - -> [!IMPORTANT] -> Below we provide some simple shell scripts that run the components for each configuration. Each shell script simply runs `python3 -m dynamo.frontend` to start the ingress and `python3 -m dynamo.trtllm` to start the workers. You can easily take each command and run it in a separate terminal, as sketched below.
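As a minimal sketch of the two-terminal workflow described in the note above (the worker flags mirror the arguments used in the deploy manifests added later in this change; treat the model name and engine config path as placeholders for your own setup):

```bash
# Terminal 1: start the OpenAI-compatible HTTP ingress
python3 -m dynamo.frontend --http-port 8000

# Terminal 2: start an aggregated TensorRT-LLM worker
python3 -m dynamo.trtllm \
  --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
  --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
  --extra-engine-args engine_configs/agg.yaml
```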
- -This figure shows an overview of the major components to deploy: - -``` -+------+ +-----------+ +------------------+ +---------------+ -| HTTP |----->| processor |----->| Worker1 |------------>| Worker2 | -| |<-----| |<-----| |<------------| | -+------+ +-----------+ +------------------+ +---------------+ - | ^ | - query best | | return | publish kv events - worker | | worker_id v - | | +------------------+ - | +---------| kv-router | - +------------->| | - +------------------+ -``` - -**Note:** The diagram above shows all possible components in a deployment. Depending on the chosen disaggregation strategy, you can configure whether Worker1 handles prefill and Worker2 handles decode, or vice versa. For more information on how to select and configure these strategies, see the [Disaggregation Strategy](#disaggregation-strategy) section below. - -### Aggregated -```bash -cd $DYNAMO_HOME/components/backends/trtllm -./launch/agg.sh -``` - -### Aggregated with KV Routing -```bash -cd $DYNAMO_HOME/components/backends/trtllm -./launch/agg_router.sh -``` - -### Disaggregated - -> [!IMPORTANT] -> Disaggregated serving supports two strategies for request flow: `"prefill_first"` and `"decode_first"`. By default, the script below uses the `"decode_first"` strategy, which can reduce response latency by minimizing extra hops in the return path. You can switch strategies by setting the `DISAGGREGATION_STRATEGY` environment variable. - -```bash -cd $DYNAMO_HOME/components/backends/trtllm -./launch/disagg.sh -``` - -### Disaggregated with KV Routing - -> [!IMPORTANT] -> Disaggregated serving with KV routing uses a "prefill first" workflow by default. Currently, Dynamo supports KV routing to only one endpoint per model. In disaggregated workflow, it is generally more effective to route requests to the prefill worker. If you wish to use a "decode first" workflow instead, you can simply set the `DISAGGREGATION_STRATEGY` environment variable accordingly. - -```bash -cd $DYNAMO_HOME/components/backends/trtllm -./launch/disagg_router.sh -``` - -### Aggregated with Multi-Token Prediction (MTP) and DeepSeek R1 -```bash -cd $DYNAMO_HOME/components/backends/trtllm - -export AGG_ENGINE_ARGS=./engine_configs/deepseek_r1/mtp/mtp_agg.yaml -export SERVED_MODEL_NAME="nvidia/DeepSeek-R1-FP4" -# nvidia/DeepSeek-R1-FP4 is a large model -export MODEL_PATH="nvidia/DeepSeek-R1-FP4" -./launch/agg.sh -``` - -Notes: -- MTP is only available within the container built with the experimental TensorRT-LLM commit. Please add --use-default-experimental-tensorrtllm-commit to the arguments of the build.sh script. - - Example: `./container/build.sh --framework tensorrtllm --use-default-experimental-tensorrtllm-commit` - -- There is a noticeable latency for the first two inference requests. Please send warm-up requests before starting the benchmark. -- MTP performance may vary depending on the acceptance rate of predicted tokens, which is dependent on the dataset or queries used while benchmarking. Additionally, `ignore_eos` should generally be omitted or set to `false` when using MTP to avoid speculating garbage outputs and getting unrealistic acceptance rates. - -## Advanced Examples - -Below we provide a selected list of advanced examples. Please open up an issue if you'd like to see a specific example! - -### Multinode Deployment - -For comprehensive instructions on multinode serving, see the [multinode-examples.md](./multinode/multinode-examples.md) guide. 
It provides step-by-step deployment examples and configuration tips for running Dynamo with TensorRT-LLM across multiple nodes. While the walkthrough uses DeepSeek-R1 as the model, you can easily adapt the process for any supported model by updating the relevant configuration files. See the [Llama4+eagle](./llama4_plus_eagle.md) guide to learn how to use these scripts when a single worker fits on a single node. - -### Speculative Decoding -- **[Llama 4 Maverick Instruct + Eagle Speculative Decoding](./llama4_plus_eagle.md)** - -## Disaggregation Strategy - -The disaggregation strategy controls how requests are distributed between the prefill and decode workers in a disaggregated deployment. - -By default, Dynamo uses a `decode first` strategy: incoming requests are initially routed to the decode worker, which then forwards them to the prefill worker in round-robin fashion. The prefill worker processes the request and returns results to the decode worker for any remaining decode operations. - -When using KV routing, however, Dynamo switches to a `prefill first` strategy. In this mode, requests are routed directly to the prefill worker, which can help maximize KV cache reuse and improve overall efficiency for certain workloads. Choosing the appropriate strategy can have a significant impact on performance, depending on your use case. - -The disaggregation strategy can be set using the `DISAGGREGATION_STRATEGY` environment variable. You can set the strategy before launching your deployment, for example: -```bash -DISAGGREGATION_STRATEGY="prefill_first" ./launch/disagg.sh -``` - -## KV Cache Transfer in Disaggregated Serving - -Dynamo with TensorRT-LLM supports two methods for transferring KV cache in disaggregated serving: UCX (default) and NIXL (experimental). For detailed information and configuration instructions for each method, see the [KV cache transfer guide](./kv-cache-tranfer.md). - -## Request Migration - -In a [Distributed System](#distributed-system), a request may fail due to connectivity issues between the Frontend and the Backend. - -The Frontend will automatically track which Backends are having connectivity issues with it and avoid routing new requests to the Backends with known connectivity issues. - -For ongoing requests, a `--migration-limit` flag can be set on the Backend to tell the Frontend how many times a request may be migrated to another Backend if connectivity to the current Backend is lost. - -For example, -```bash -python3 -m dynamo.trtllm ... --migration-limit=3 -``` -indicates that a request to this model may be migrated up to 3 times to another Backend before the request is failed, should the Frontend detect a connectivity issue to the current Backend. - -The migrated request continues responding to the original request, allowing a seamless transition between Backends and a lower overall request failure rate at the Frontend for a better user experience. - -## Client - -See the [client](../llm/README.md#client) section to learn how to send a request to the deployment. - -NOTE: To send a request to a multi-node deployment, target the node which is running `python3 -m dynamo.frontend`.
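As a quick sanity check, here is a hedged example request, assuming the frontend is listening on its default `--http-port 8000` and that the model name below matches whatever you passed to `--served-model-name`:

```bash
curl localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    "messages": [
      {"role": "user", "content": "Briefly explain disaggregated serving."}
    ],
    "stream": false,
    "max_tokens": 64
  }'
```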
- -## Benchmarking - -To benchmark your deployment with GenAI-Perf, see this utility script, configuring the -`model` name and `host` based on your deployment: [perf.sh](../../../benchmarks/llm/perf.sh) diff --git a/components/backends/trtllm/README.md b/components/backends/trtllm/README.md new file mode 120000 index 0000000000..a2fb560cd1 --- /dev/null +++ b/components/backends/trtllm/README.md @@ -0,0 +1 @@ +../../../docs/components/backends/trtllm/README.md \ No newline at end of file diff --git a/components/backends/trtllm/deploy/agg.yaml b/components/backends/trtllm/deploy/agg.yaml new file mode 100644 index 0000000000..3fe9ad54ac --- /dev/null +++ b/components/backends/trtllm/deploy/agg.yaml @@ -0,0 +1,104 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: trtllm-agg +spec: + services: + Frontend: + dynamoNamespace: trtllm-agg + componentType: main + livenessProbe: + exec: + command: + - /bin/sh + - -c + - 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""' + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + readinessProbe: + exec: + command: + - /bin/sh + - -c + - 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""' + initialDelaySeconds: 60 + periodSeconds: 60 + timeoutSeconds: 3 + failureThreshold: 10 + replicas: 1 + resources: + requests: + cpu: "5" + memory: "10Gi" + limits: + cpu: "5" + memory: "10Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17 + workingDir: /workspace/components/backends/trtllm + command: + - /bin/sh + - -c + args: + - "python3 -m dynamo.frontend --http-port 8000" + TRTLLMWorker: + envFromSecret: hf-token-secret + livenessProbe: + httpGet: + path: /live + port: 9090 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /health + port: 9090 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 60 + dynamoNamespace: trtllm-agg + componentType: worker + replicas: 1 + resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" + envs: + - name: DYN_SYSTEM_ENABLED + value: "true" + - name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS + value: "[\"generate\"]" + - name: DYN_SYSTEM_PORT + value: "9090" + extraPodSpec: + mainContainer: + startupProbe: + httpGet: + path: /health + port: 9090 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 60 + image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17 + workingDir: /workspace/components/backends/trtllm + args: + - "python3" + - "-m" + - "dynamo.trtllm" + - "--model-path" + - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" + - "--served-model-name" + - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" + - "--extra-engine-args" + - "engine_configs/agg.yaml" diff --git a/components/backends/trtllm/deploy/agg_router.yaml b/components/backends/trtllm/deploy/agg_router.yaml new file mode 100644 index 0000000000..116693d90a --- /dev/null +++ b/components/backends/trtllm/deploy/agg_router.yaml @@ -0,0 +1,105 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: trtllm-agg-router +spec: + services: + Frontend: + livenessProbe: + exec: + command: + - /bin/sh + - -c + - 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""' + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + readinessProbe: + exec: + command: + - /bin/sh + - -c + - 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""' + initialDelaySeconds: 60 + periodSeconds: 60 + timeoutSeconds: 3 + failureThreshold: 5 + dynamoNamespace: trtllm-agg-router + componentType: main + replicas: 1 + resources: + requests: + cpu: "1" + memory: "2Gi" + limits: + cpu: "1" + memory: "2Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17 + workingDir: /workspace/components/backends/trtllm + command: + - /bin/sh + - -c + args: + - "python3 -m dynamo.frontend --http-port 8000 --router-mode kv" + TRTLLMWorker: + envFromSecret: hf-token-secret + livenessProbe: + httpGet: + path: /live + port: 9090 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /health + port: 9090 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 60 + dynamoNamespace: trtllm-agg-router + componentType: worker + replicas: 2 + resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" + envs: + - name: DYN_SYSTEM_ENABLED + value: "true" + - name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS + value: "[\"generate\"]" + - name: DYN_SYSTEM_PORT + value: "9090" + extraPodSpec: + mainContainer: + startupProbe: + httpGet: + path: /health + port: 9090 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 60 + image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17 + workingDir: /workspace/components/backends/trtllm + args: + - "python3" + - "-m" + - "dynamo.trtllm" + - "--model-path" + - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" + - "--served-model-name" + - "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" + - "--extra-engine-args" + - "engine_configs/agg.yaml" + - "--publish-events-and-metrics" diff --git a/components/backends/trtllm/deploy/disagg.yaml b/components/backends/trtllm/deploy/disagg.yaml new file mode 100644 index 0000000000..be2eefcd51 --- /dev/null +++ b/components/backends/trtllm/deploy/disagg.yaml @@ -0,0 +1,150 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: trtllm-disagg +spec: + services: + Frontend: + dynamoNamespace: trtllm-disagg + componentType: main + livenessProbe: + exec: + command: + - /bin/sh + - -c + - 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""' + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + readinessProbe: + exec: + command: + - /bin/sh + - -c + - 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""' + initialDelaySeconds: 60 + periodSeconds: 60 + timeoutSeconds: 3 + failureThreshold: 10 + replicas: 1 + resources: + requests: + cpu: "5" + memory: "10Gi" + limits: + cpu: "5" + memory: "10Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17 + workingDir: /workspace/components/backends/trtllm + command: + - /bin/sh + - -c + args: + - "python3 -m dynamo.frontend --http-port 8000" + TRTLLMPrefillWorker: + dynamoNamespace: trtllm-disagg + envFromSecret: hf-token-secret + componentType: worker + replicas: 1 + livenessProbe: + httpGet: + path: /live + port: 9090 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /health + port: 9090 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 60 + resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" + extraPodSpec: + mainContainer: + startupProbe: + httpGet: + path: /health + port: 9090 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 60 + image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17 + workingDir: /workspace/components/backends/trtllm + command: + - /bin/sh + - -c + args: + - "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/prefill.yaml --disaggregation-mode prefill --disaggregation-strategy decode_first 2>&1 | tee /tmp/trtllm.log" + envs: + - name: DYN_SYSTEM_ENABLED + value: "true" + - name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS + value: "[\"generate\"]" + - name: DYN_SYSTEM_PORT + value: "9090" + TRTLLMDecodeWorker: + dynamoNamespace: trtllm-disagg + envFromSecret: hf-token-secret + componentType: worker + replicas: 1 + livenessProbe: + httpGet: + path: /live + port: 9090 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /health + port: 9090 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 60 + resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" + extraPodSpec: + mainContainer: + startupProbe: + httpGet: + path: /health + port: 9090 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 60 + image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17 + workingDir: /workspace/components/backends/trtllm + command: + - /bin/sh + - -c + args: + - "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/decode.yaml --disaggregation-mode decode --disaggregation-strategy decode_first 2>&1 | tee /tmp/trtllm.log" + envs: + - name: DYN_SYSTEM_ENABLED + value: "true" + - name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS + value: "[\"generate\"]" + - name: DYN_SYSTEM_PORT + value: "9090" diff --git a/components/backends/trtllm/deploy/disagg_router.yaml 
b/components/backends/trtllm/deploy/disagg_router.yaml new file mode 100644 index 0000000000..512138cbbf --- /dev/null +++ b/components/backends/trtllm/deploy/disagg_router.yaml @@ -0,0 +1,150 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: trtllm-v1-disagg-router +spec: + services: + Frontend: + dynamoNamespace: trtllm-v1-disagg-router + componentType: main + livenessProbe: + exec: + command: + - /bin/sh + - -c + - 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""' + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + readinessProbe: + exec: + command: + - /bin/sh + - -c + - 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""' + initialDelaySeconds: 60 + periodSeconds: 60 + timeoutSeconds: 3 + failureThreshold: 10 + replicas: 1 + resources: + requests: + cpu: "5" + memory: "10Gi" + limits: + cpu: "5" + memory: "10Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17 + workingDir: /workspace/components/backends/trtllm + command: + - /bin/sh + - -c + args: + - "python3 -m dynamo.frontend --http-port 8000 --router-mode kv" + TRTLLMPrefillWorker: + dynamoNamespace: trtllm-v1-disagg-router + envFromSecret: hf-token-secret + componentType: worker + replicas: 2 + livenessProbe: + httpGet: + path: /live + port: 9090 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /health + port: 9090 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 60 + resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" + extraPodSpec: + mainContainer: + startupProbe: + httpGet: + path: /health + port: 9090 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 60 + image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17 + workingDir: /workspace/components/backends/trtllm + command: + - /bin/sh + - -c + args: + - "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/prefill.yaml --disaggregation-mode prefill --disaggregation-strategy prefill_first --publish-events-and-metrics 2>&1 | tee /tmp/trtllm.log" + envs: + - name: DYN_SYSTEM_ENABLED + value: "true" + - name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS + value: "[\"generate\"]" + - name: DYN_SYSTEM_PORT + value: "9090" + TRTLLMDecodeWorker: + dynamoNamespace: trtllm-v1-disagg-router + envFromSecret: hf-token-secret + componentType: worker + replicas: 1 + livenessProbe: + httpGet: + path: /live + port: 9090 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /health + port: 9090 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 60 + resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" + extraPodSpec: + mainContainer: + startupProbe: + httpGet: + path: /health + port: 9090 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 60 + image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17 + workingDir: /workspace/components/backends/trtllm + command: + - /bin/sh + - -c + args: + - "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args 
engine_configs/decode.yaml --disaggregation-mode decode --disaggregation-strategy prefill_first 2>&1 | tee /tmp/trtllm.log" + envs: + - name: DYN_SYSTEM_ENABLED + value: "true" + - name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS + value: "[\"generate\"]" + - name: DYN_SYSTEM_PORT + value: "9090" diff --git a/components/backends/trtllm/engine_configs/agg.yaml b/components/backends/trtllm/engine_configs/agg.yaml index 02b5cd8463..d349a65756 100644 --- a/components/backends/trtllm/engine_configs/agg.yaml +++ b/components/backends/trtllm/engine_configs/agg.yaml @@ -28,4 +28,7 @@ kv_cache_config: # NOTE: overlap_scheduler enabled by default since this commit and changed # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler': # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 -use_cuda_graph: true + + +cuda_graph_config: + max_batch_size: 16 \ No newline at end of file diff --git a/components/backends/trtllm/engine_configs/decode.yaml b/components/backends/trtllm/engine_configs/decode.yaml index 3460f6ff80..bafc26d450 100644 --- a/components/backends/trtllm/engine_configs/decode.yaml +++ b/components/backends/trtllm/engine_configs/decode.yaml @@ -16,11 +16,16 @@ tensor_parallel_size: 1 moe_expert_parallel_size: 1 enable_attention_dp: false max_num_tokens: 8192 -max_batch_size: 16 trust_remote_code: true backend: pytorch enable_chunked_prefill: true disable_overlap_scheduler: false -use_cuda_graph: true + +cuda_graph_config: + max_batch_size: 16 + kv_cache_config: free_gpu_memory_fraction: 0.95 + +cache_transceiver_config: + backend: default diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_agg.yaml b/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_agg.yaml index f0b5411221..25fae60abf 100644 --- a/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_agg.yaml +++ b/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_agg.yaml @@ -28,23 +28,24 @@ max_num_tokens: 8448 max_seq_len: 8448 kv_cache_config: free_gpu_memory_fraction: 0.30 + dtype: fp8 # Enable the MTP(Multi-Token Prediction) in the model engine speculative_config: decoding_type: MTP num_nextn_predict_layers: 1 -use_cuda_graph: true -cuda_graph_padding_enabled: true -cuda_graph_batch_sizes: -- 1 -- 2 -- 4 -- 8 -- 16 -- 32 -- 64 -- 128 -- 256 +cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + print_iter_log: true -kv_cache_dtype: fp8 diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_decode.yaml b/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_decode.yaml index ab48b2e78b..59b9aabe98 100644 --- a/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_decode.yaml +++ b/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_decode.yaml @@ -31,23 +31,27 @@ max_num_tokens: 512 max_seq_len: 8704 kv_cache_config: free_gpu_memory_fraction: 0.85 + dtype: fp8 # Enable the MTP(Multi-Token Prediction) in decode model engine speculative_config: decoding_type: MTP num_nextn_predict_layers: 1 -use_cuda_graph: true -cuda_graph_padding_enabled: true -cuda_graph_batch_sizes: -- 1 -- 2 -- 4 -- 8 -- 16 -- 32 -- 64 -- 128 -- 256 +cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + print_iter_log: true -kv_cache_dtype: fp8 + +cache_transceiver_config: + backend: default diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_prefill.yaml 
b/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_prefill.yaml index ee6ee26a94..f44bcac141 100644 --- a/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_prefill.yaml +++ b/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_prefill.yaml @@ -27,11 +27,15 @@ max_num_tokens: 8192 max_seq_len: 8192 kv_cache_config: free_gpu_memory_fraction: 0.75 + dtype: fp8 + print_iter_log: true -kv_cache_dtype: fp8 disable_overlap_scheduler: true # Enable the MTP(Multi-Token Prediction) in the prefill model engine speculative_config: decoding_type: MTP num_nextn_predict_layers: 1 + +cache_transceiver_config: + backend: default diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/simple/agg.yaml b/components/backends/trtllm/engine_configs/deepseek_r1/simple/agg.yaml index 29dddba56f..db2377a92a 100644 --- a/components/backends/trtllm/engine_configs/deepseek_r1/simple/agg.yaml +++ b/components/backends/trtllm/engine_configs/deepseek_r1/simple/agg.yaml @@ -31,24 +31,26 @@ kv_cache_config: # With dp attention enabled: large ISL at high concurrency may need # free_gpu_memory_fraction low to have enough available memory. # free_gpu_memory_fraction: 0.30 + dtype: fp8 + # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603 # NOTE: overlap_scheduler enabled by default since this commit and changed # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler': # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 -use_cuda_graph: true -cuda_graph_padding_enabled: true +cuda_graph_config: + enable_padding: true # NOTE: For larger max batch size, you may want to add larger cuda graph # batch sizes below to match. -cuda_graph_batch_sizes: -- 1 -- 2 -- 4 -- 8 -- 16 -- 32 -- 64 -- 128 -- 256 + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + print_iter_log: true -kv_cache_dtype: fp8 diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/simple/decode.yaml b/components/backends/trtllm/engine_configs/deepseek_r1/simple/decode.yaml index 772b94b283..73e193c146 100644 --- a/components/backends/trtllm/engine_configs/deepseek_r1/simple/decode.yaml +++ b/components/backends/trtllm/engine_configs/deepseek_r1/simple/decode.yaml @@ -31,25 +31,30 @@ kv_cache_config: # With dp attention enabled: large ISL at high concurrency may need # free_gpu_memory_fraction low to have enough available memory. # free_gpu_memory_fraction: 0.30 + dtype: fp8 # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603 # NOTE: overlap_scheduler enabled by default since this commit and changed # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler': # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 disable_overlap_scheduler: false -use_cuda_graph: true -cuda_graph_padding_enabled: true -# NOTE: For larger max batch size, you may want to add larger cuda graph -# batch sizes below to match. -cuda_graph_batch_sizes: -- 1 -- 2 -- 4 -- 8 -- 16 -- 32 -- 64 -- 128 -- 256 + +cuda_graph_config: + enable_padding: true + # NOTE: For larger max batch size, you may want to + # add larger cuda graph batch sizes below to match. 
+ batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + print_iter_log: true -kv_cache_dtype: fp8 + +cache_transceiver_config: + backend: default diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/simple/prefill.yaml b/components/backends/trtllm/engine_configs/deepseek_r1/simple/prefill.yaml index 6ae899a68a..3d6d4d3574 100644 --- a/components/backends/trtllm/engine_configs/deepseek_r1/simple/prefill.yaml +++ b/components/backends/trtllm/engine_configs/deepseek_r1/simple/prefill.yaml @@ -26,6 +26,7 @@ max_seq_len: 8192 kv_cache_config: free_gpu_memory_fraction: 0.75 + dtype: fp8 # NOTE: This dtype must match in both prefill/decode configs # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603 # NOTE: overlap_scheduler enabled by default since this commit and changed @@ -33,5 +34,6 @@ kv_cache_config: # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 disable_overlap_scheduler: true print_iter_log: true -# NOTE: This dtype must match in both prefill/decode configs -kv_cache_dtype: fp8 + +cache_transceiver_config: + backend: default diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/dep16_agg.yaml b/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/dep16_agg.yaml index d697caacfa..844c4ffa72 100644 --- a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/dep16_agg.yaml +++ b/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/dep16_agg.yaml @@ -10,18 +10,20 @@ enable_attention_dp: true max_batch_size: 256 max_num_tokens: 256 max_seq_len: 8448 + kv_cache_config: free_gpu_memory_fraction: 0.7 -use_cuda_graph: true -cuda_graph_padding_enabled: true -cuda_graph_batch_sizes: -- 1 -- 2 -- 4 -- 8 -- 16 -- 32 -- 64 -- 128 -- 256 -kv_cache_dtype: fp8 + dtype: fp8 + +cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml b/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml index 4f2df0aa56..d32aab2dd3 100644 --- a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml +++ b/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml @@ -3,14 +3,16 @@ backend: pytorch # WideEP related settings -moe_backend: WideEP -# moe_max_num_tokens will default to max_num_tokens if left unspecified. -# -# If you want to set this value explicitly, one recommendation is below: -# moe_max_num_tokens = max_batch_size * moe_expert_parallel_size -# 4096 = 256 * 16 -# moe_max_num_tokens: 4096 -moe_load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml +moe_config: + backend: WIDEEP + # moe_max_num_tokens will default to max_num_tokens if left unspecified. 
+ # + # If you want to set this value explicitly, one recommendation is below: + # moe_max_num_tokens = max_batch_size * moe_expert_parallel_size + # 4096 = 256 * 16 + # moe_max_num_tokens: 4096 + load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml + tensor_parallel_size: 16 moe_expert_parallel_size: 16 @@ -18,18 +20,20 @@ enable_attention_dp: true max_batch_size: 256 max_num_tokens: 256 max_seq_len: 8448 + kv_cache_config: - free_gpu_memory_fraction: 0.7 -use_cuda_graph: true -cuda_graph_padding_enabled: true -cuda_graph_batch_sizes: -- 1 -- 2 -- 4 -- 8 -- 16 -- 32 -- 64 -- 128 -- 256 -kv_cache_dtype: fp8 + free_gpu_memory_fraction: 0.3 + dtype: fp8 + +cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 \ No newline at end of file diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml b/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml index a8d1854814..652cf82250 100644 --- a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml +++ b/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml @@ -15,8 +15,9 @@ backend: pytorch # WideEP related settings -moe_backend: WideEP -moe_load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml +moe_config: + backend: WIDEEP + load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml # TP/EP/PP/DP tensor_parallel_size: 16 @@ -35,25 +36,31 @@ kv_cache_config: # With dp attention enabled: large ISL at high concurrency may need # free_gpu_memory_fraction low to have enough available memory. free_gpu_memory_fraction: 0.30 + dtype: fp8 + # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603 # NOTE: overlap_scheduler enabled by default since this commit and changed # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler': # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 disable_overlap_scheduler: false -use_cuda_graph: true -cuda_graph_padding_enabled: true -# NOTE: For larger max batch size, you may want to add larger cuda graph -# batch sizes below to match. -cuda_graph_batch_sizes: -- 1 -- 2 -- 4 -- 8 -- 16 -- 32 -- 64 -- 128 -- 256 +cuda_graph_config: + enable_padding: true + # NOTE: For larger max batch size, you may want to + # add larger cuda graph batch sizes below to match. 
+ batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + + print_iter_log: true -kv_cache_dtype: fp8 + +cache_transceiver_config: + backend: default diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml b/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml index 44e439e506..4f7aabe682 100644 --- a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml +++ b/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml @@ -15,8 +15,9 @@ backend: pytorch # WideEP related settings -moe_backend: WideEP -moe_load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml +moe_config: + backend: WIDEEP + load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml # TP/EP/PP/DP tensor_parallel_size: 16 @@ -29,7 +30,8 @@ max_num_tokens: 8192 max_seq_len: 8192 kv_cache_config: - free_gpu_memory_fraction: 0.75 + free_gpu_memory_fraction: 0.3 + dtype: fp8 # NOTE: This dtype must match in both prefill/decode configs # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603 # NOTE: overlap_scheduler enabled by default since this commit and changed @@ -37,5 +39,6 @@ kv_cache_config: # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 disable_overlap_scheduler: true print_iter_log: true -# NOTE: This dtype must match in both prefill/decode configs -kv_cache_dtype: fp8 + +cache_transceiver_config: + backend: default diff --git a/components/backends/trtllm/engine_configs/llama4/eagle/eagle_agg.yaml b/components/backends/trtllm/engine_configs/llama4/eagle/eagle_agg.yaml index 1bed25ef27..297a01595e 100644 --- a/components/backends/trtllm/engine_configs/llama4/eagle/eagle_agg.yaml +++ b/components/backends/trtllm/engine_configs/llama4/eagle/eagle_agg.yaml @@ -21,31 +21,21 @@ max_batch_size: 256 # Will be investigated in the future with TRTLLM team. 
max_num_tokens: 1024 max_seq_len: 8448 -autotuner_enabled: false +enable_autotuner: false disable_overlap_scheduler: true # Enable Speculative Decoding in the model engine speculative_config: decoding_type: Eagle max_draft_len: 1 - pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3 - eagle3_one_model: False + speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3 + eagle3_one_model: false kv_cache_config: free_gpu_memory_fraction: 0.5 enable_block_reuse: false -use_cuda_graph: true -cuda_graph_padding_enabled: true -cuda_graph_batch_sizes: -- 1 -- 2 -- 4 -- 8 -- 16 -- 32 -- 64 -- 128 -- 256 -print_iter_log: true -kv_cache_dtype: fp8 + +cuda_graph_config: + max_batch_size: 8 + diff --git a/components/backends/trtllm/engine_configs/llama4/eagle/eagle_decode.yaml b/components/backends/trtllm/engine_configs/llama4/eagle/eagle_decode.yaml index 4b595d2126..0b8d799bfb 100644 --- a/components/backends/trtllm/engine_configs/llama4/eagle/eagle_decode.yaml +++ b/components/backends/trtllm/engine_configs/llama4/eagle/eagle_decode.yaml @@ -21,30 +21,34 @@ max_num_tokens: 512 # 8704 = 8192 ISL + 512 OSL max_seq_len: 8704 disable_overlap_scheduler: true -autotuner_enabled: false +enable_autotuner: false # Enable Speculative Decoding in the model engine speculative_config: decoding_type: Eagle max_draft_len: 1 - pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3 - eagle3_one_model: False + speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3 + eagle3_one_model: false kv_cache_config: free_gpu_memory_fraction: 0.5 enable_block_reuse: false + dtype: fp8 + +cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 -use_cuda_graph: true -cuda_graph_padding_enabled: true -cuda_graph_batch_sizes: -- 1 -- 2 -- 4 -- 8 -- 16 -- 32 -- 64 -- 128 -- 256 print_iter_log: true -kv_cache_dtype: fp8 + +cache_transceiver_config: + backend: default diff --git a/components/backends/trtllm/engine_configs/llama4/eagle/eagle_prefill.yaml b/components/backends/trtllm/engine_configs/llama4/eagle/eagle_prefill.yaml index 8442e478ba..b05181b226 100644 --- a/components/backends/trtllm/engine_configs/llama4/eagle/eagle_prefill.yaml +++ b/components/backends/trtllm/engine_configs/llama4/eagle/eagle_prefill.yaml @@ -20,17 +20,20 @@ max_batch_size: 1 max_num_tokens: 8192 max_seq_len: 8192 print_iter_log: true -kv_cache_dtype: fp8 disable_overlap_scheduler: true -autotuner_enabled: false +enable_autotuner: false # Enable Speculative Decoding in the model engine speculative_config: decoding_type: Eagle max_draft_len: 1 - pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3 - eagle3_one_model: False + speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3 + eagle3_one_model: false kv_cache_config: free_gpu_memory_fraction: 0.5 enable_block_reuse: false + dtype: fp8 + +cache_transceiver_config: + backend: default diff --git a/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_agg.yml b/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_agg.yml index 56ccf8d07d..cada38087c 100644 --- a/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_agg.yml +++ b/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_agg.yml @@ -24,7 +24,7 @@ disable_overlap_scheduler: true # disable_overlap_scheduler is having acc issue speculative_config: decoding_type: Eagle max_draft_len: 3 - pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3 + speculative_model_dir: 
nvidia/Llama-4-Maverick-17B-128E-Eagle3 eagle3_one_model: true kv_cache_config: diff --git a/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_decode.yaml b/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_decode.yaml index 556a1365f5..43f04e2715 100644 --- a/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_decode.yaml +++ b/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_decode.yaml @@ -26,7 +26,7 @@ disable_overlap_scheduler: true speculative_config: decoding_type: Eagle max_draft_len: 3 - pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3 + speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3 eagle3_one_model: True kv_cache_config: @@ -38,3 +38,6 @@ cuda_graph_config: max_batch_size: 256 print_iter_log: true + +cache_transceiver_config: + backend: default diff --git a/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_prefill.yaml b/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_prefill.yaml index a75d2a6219..1cfc62ab02 100644 --- a/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_prefill.yaml +++ b/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_prefill.yaml @@ -26,9 +26,12 @@ disable_overlap_scheduler: true speculative_config: decoding_type: Eagle max_draft_len: 3 - pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3 + speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3 eagle3_one_model: True kv_cache_config: free_gpu_memory_fraction: 0.5 enable_block_reuse: false + +cache_transceiver_config: + backend: default diff --git a/components/backends/trtllm/engine_configs/prefill.yaml b/components/backends/trtllm/engine_configs/prefill.yaml index 5dee9e653d..265d1f9289 100644 --- a/components/backends/trtllm/engine_configs/prefill.yaml +++ b/components/backends/trtllm/engine_configs/prefill.yaml @@ -16,13 +16,15 @@ tensor_parallel_size: 1 moe_expert_parallel_size: 1 enable_attention_dp: false max_num_tokens: 8192 -max_batch_size: 16 trust_remote_code: true backend: pytorch enable_chunked_prefill: true # Overlap scheduler not currently supported in prefill only workers. disable_overlap_scheduler: true -use_cuda_graph: false - +cuda_graph_config: + max_batch_size: 16 kv_cache_config: free_gpu_memory_fraction: 0.95 + +cache_transceiver_config: + backend: default \ No newline at end of file diff --git a/components/backends/trtllm/src/dynamo/trtllm/main.py b/components/backends/trtllm/src/dynamo/trtllm/main.py index f6988fd34c..144f780849 100644 --- a/components/backends/trtllm/src/dynamo/trtllm/main.py +++ b/components/backends/trtllm/src/dynamo/trtllm/main.py @@ -101,8 +101,10 @@ async def init(runtime: DistributedRuntime, config: Config): kv_cache_config["event_buffer_max_size"] = DEFAULT_KV_EVENT_BUFFER_MAX_SIZE else: kv_cache_config = arg_map["kv_cache_config"] - if not kv_cache_config.event_buffer_max_size: - kv_cache_config.event_buffer_max_size = DEFAULT_KV_EVENT_BUFFER_MAX_SIZE + if "event_buffer_max_size" not in kv_cache_config: + kv_cache_config[ + "event_buffer_max_size" + ] = DEFAULT_KV_EVENT_BUFFER_MAX_SIZE arg_map["kv_cache_config"] = kv_cache_config # Only pytorch backend is supported for now to publish events and metrics. 
diff --git a/components/backends/vllm/README.md b/components/backends/vllm/README.md index f20b9bb9d0..895d742112 100644 --- a/components/backends/vllm/README.md +++ b/components/backends/vllm/README.md @@ -35,19 +35,19 @@ git checkout $(git describe --tags $(git rev-list --tags --max-count=1)) | Feature | vLLM | Notes | |---------|------|-------| -| [**Disaggregated Serving**](../../docs/architecture/disagg_serving.md) | βœ… | | -| [**Conditional Disaggregation**](../../docs/architecture/disagg_serving.md#conditional-disaggregation) | 🚧 | WIP | -| [**KV-Aware Routing**](../../docs/architecture/kv_cache_routing.md) | βœ… | | -| [**SLA-Based Planner**](../../docs/architecture/sla_planner.md) | βœ… | | -| [**Load Based Planner**](../../docs/architecture/load_planner.md) | 🚧 | WIP | -| [**KVBM**](../../docs/architecture/kvbm_architecture.md) | 🚧 | WIP | +| [**Disaggregated Serving**](../../../docs/architecture/disagg_serving.md) | βœ… | | +| [**Conditional Disaggregation**](../../../docs/architecture/disagg_serving.md#conditional-disaggregation) | 🚧 | WIP | +| [**KV-Aware Routing**](../../../docs/architecture/kv_cache_routing.md) | βœ… | | +| [**SLA-Based Planner**](../../../docs/architecture/sla_planner.md) | βœ… | | +| [**Load Based Planner**](../../../docs/architecture/load_planner.md) | 🚧 | WIP | +| [**KVBM**](../../../docs/architecture/kvbm_architecture.md) | 🚧 | WIP | ### Large Scale P/D and WideEP Features | Feature | vLLM | Notes | |--------------------|------|-----------------------------------------------------------------------| | **WideEP** | βœ… | Support for PPLX / DeepEP not verified | -| **DP Rank Routing**| βœ… | Supported via external control of DP ranks | +| **Attention DP** | βœ… | Supported via external control of DP ranks | | **GB200 Support** | 🚧 | Container functional on main | ## Quick Start @@ -56,7 +56,7 @@ Below we provide a guide that lets you run all of our the common deployment patt ### Start NATS and ETCD in the background -Start using [Docker Compose](../../../deploy/docker-compose.yml) +Start using Docker Compose ```bash docker compose -f deploy/docker-compose.yml up -d @@ -152,73 +152,7 @@ Below we provide a selected list of advanced deployments. Please open up an issu ### Kubernetes Deployment -For Kubernetes deployment, YAML manifests are provided in the `deploy/` directory. These define DynamoGraphDeployment resources for various configurations: - -- `agg.yaml` - Aggregated serving -- `agg_router.yaml` - Aggregated serving with KV routing -- `disagg.yaml` - Disaggregated serving -- `disagg_router.yaml` - Disaggregated serving with KV routing -- `disagg_planner.yaml` - Disaggregated serving with [SLA Planner](../../../docs/architecture/sla_planner.md). See [SLA Planner Deployment Guide](../../../docs/guides/dynamo_deploy/sla_planner_deployment.md) for more details. - -#### Prerequisites - -- **Dynamo Cloud**: Follow the [Quickstart Guide](../../../docs/guides/dynamo_deploy/quickstart.md) to deploy Dynamo Cloud first. - -- **Container Images**: We have public images available on [NGC Catalog](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-dynamo/collections/ai-dynamo/artifacts). 
If you'd prefer to use your own registry, build and push your own image: - ```bash - ./container/build.sh --framework VLLM - # Tag and push to your container registry - # Update the image references in the YAML files - ``` - -- **Pre-Deployment Profiling (if Using SLA Planner)**: Follow the [pre-deployment profiling guide](../../../docs/architecture/pre_deployment_profiling.md) to run pre-deployment profiling. The results will be saved to the `profiling-pvc` PVC and queried by the SLA Planner. - -- **Port Forwarding**: After deployment, forward the frontend service to access the API: - ```bash - kubectl port-forward deployment/vllm-v1-disagg-frontend- 8080:8000 - ``` - -#### Deploy to Kubernetes - -Example with disagg: -Export the NAMESPACE you used in your Dynamo Cloud Installation. - -```bash -cd dynamo -cd components/backends/vllm/deploy -kubectl apply -f disagg.yaml -n $NAMESPACE -``` - -To change `DYN_LOG` level, edit the yaml file by adding - -```yaml -... -spec: - envs: - - name: DYN_LOG - value: "debug" # or other log levels - ... -``` - -### Testing the Deployment - -Send a test request to verify your deployment: - -```bash -curl localhost:8080/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "Qwen/Qwen3-0.6B", - "messages": [ - { - "role": "user", - "content": "In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden." - } - ], - "stream": false, - "max_tokens": 30 - }' -``` +For complete Kubernetes deployment instructions, configurations, and troubleshooting, see [vLLM Kubernetes Deployment Guide](../../../docs/components/backends/vllm/deploy/README.md) ## Configuration @@ -235,7 +169,7 @@ The [documentation](https://docs.vllm.ai/en/v0.9.2/configuration/serve_args.html ## Request Migration -In a [Distributed System](#distributed-system), a request may fail due to connectivity issues between the Frontend and the Backend. +In a Distributed System, a request may fail due to connectivity issues between the Frontend and the Backend. The Frontend will automatically track which Backends are having connectivity issues with it and avoid routing new requests to the Backends with known connectivity issues. 
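For illustration only, a minimal sketch of enabling request migration on a vLLM worker, assuming the worker is launched as `python3 -m dynamo.vllm` and accepts the `--migration-limit` flag defined in its argument parser (see the `args.py` changes below); the model flag is just an example:

```bash
# Allow an in-flight request to be migrated to a different Backend up to 3 times
# before the Frontend fails it (module invocation and model are illustrative).
python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --migration-limit 3
```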
diff --git a/components/backends/vllm/src/dynamo/vllm/args.py b/components/backends/vllm/src/dynamo/vllm/args.py index b86649f06b..889405f6af 100644 --- a/components/backends/vllm/src/dynamo/vllm/args.py +++ b/components/backends/vllm/src/dynamo/vllm/args.py @@ -2,13 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 -import asyncio -import json import logging import os -import socket import sys -import time from typing import Optional from vllm.config import KVTransferConfig @@ -16,9 +12,20 @@ from vllm.engine.arg_utils import AsyncEngineArgs from vllm.utils import FlexibleArgumentParser +from .ports import ( + DEFAULT_DYNAMO_PORT_MAX, + DEFAULT_DYNAMO_PORT_MIN, + DynamoPortRange, + EtcdContext, + PortAllocationRequest, + PortMetadata, + allocate_and_reserve_port, + allocate_and_reserve_port_block, + get_host_ip, +) + logger = logging.getLogger(__name__) -# Only used if you run it manually from the command line DEFAULT_ENDPOINT = "dyn://dynamo.backend.generate" DEFAULT_MODEL = "Qwen/Qwen3-0.6B" @@ -34,6 +41,7 @@ class Config: migration_limit: int = 0 kv_port: Optional[int] = None side_channel_port: Optional[int] = None + port_range: DynamoPortRange # mirror vLLM model: str @@ -64,6 +72,18 @@ def parse_args() -> Config: default=0, help="Maximum number of times a request may be migrated to a different engine worker. The number may be overridden by the engine.", ) + parser.add_argument( + "--dynamo-port-min", + type=int, + default=DEFAULT_DYNAMO_PORT_MIN, + help=f"Minimum port number for Dynamo services (default: {DEFAULT_DYNAMO_PORT_MIN}). Must be in registered ports range (1024-49151).", + ) + parser.add_argument( + "--dynamo-port-max", + type=int, + default=DEFAULT_DYNAMO_PORT_MAX, + help=f"Maximum port number for Dynamo services (default: {DEFAULT_DYNAMO_PORT_MAX}). Must be in registered ports range (1024-49151).", + ) parser = AsyncEngineArgs.add_cli_args(parser) args = parser.parse_args() @@ -110,6 +130,9 @@ def parse_args() -> Config: config.engine_args = engine_args config.is_prefill_worker = args.is_prefill_worker config.migration_limit = args.migration_limit + config.port_range = DynamoPortRange( + min=args.dynamo_port_min, max=args.dynamo_port_max + ) if config.engine_args.block_size is None: config.engine_args.block_size = 16 @@ -120,106 +143,66 @@ def parse_args() -> Config: return config -async def allocate_and_reserve_port( - namespace, - etcd_client, - worker_id: str, - reason: str, - max_attempts: int = 100, -) -> int: - """ - Get an OS-assigned port and atomically reserve it in ETCD. - Retries until successful or max_attempts reached. 
- - Args: - max_attempts: Maximum number of ports to try (default: 100) - - Raises: - RuntimeError: If unable to reserve a port within max_attempts - OSError: If unable to create sockets (system resource issues) - """ - - node_name = socket.gethostname() - try: - node_ip = socket.gethostbyname(node_name) - except socket.gaierror: - # If hostname cannot be resolved, fall back to localhost - logger.warning( - f"Hostname '{node_name}' cannot be resolved, falling back to '127.0.0.1'" - ) - node_ip = "127.0.0.1" - - for attempt in range(1, max_attempts + 1): - # Hold socket open just long enough to reserve in ETCD - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: - sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - sock.bind(("", 0)) - port = sock.getsockname()[1] - - # Reserve in ETCD while holding the socket - key = f"dyn://{namespace}/ports/{node_ip}/{port}" - value = { - "worker_id": worker_id, - "reason": reason, - "reserved_at": time.time(), - "pid": os.getpid(), - } - - try: - await etcd_client.kv_create( - key=key, - value=json.dumps(value).encode(), - lease_id=etcd_client.primary_lease_id(), - ) - logger.debug(f"Reserved OS-assigned port {port} for {worker_id}") - return port - - except Exception as e: - logger.debug( - f"Port {port} on {node_name} was already reserved (attempt {attempt}): {e}" - ) - - if attempt < max_attempts: - await asyncio.sleep(0.01) - - raise RuntimeError( - f"Failed to allocate and reserve a port after {max_attempts} attempts" - ) - - async def configure_ports_with_etcd(config: Config, etcd_client): """Configure all settings that require ETCD, including port allocation and vLLM overrides.""" - # First, allocate ports + etcd_context = EtcdContext(client=etcd_client, namespace=config.namespace) + dp_rank = config.engine_args.data_parallel_rank or 0 worker_id = f"vllm-{config.component}-dp{dp_rank}" # Allocate KV events port - kv_port = await allocate_and_reserve_port( - namespace=config.namespace, - etcd_client=etcd_client, - worker_id=f"{worker_id}", - reason="zmq_kv_event_port", + if config.engine_args.enable_prefix_caching: + kv_metadata = PortMetadata(worker_id=worker_id, reason="zmq_kv_event_port") + kv_port = await allocate_and_reserve_port( + etcd_context=etcd_context, + metadata=kv_metadata, + port_range=config.port_range, + ) + config.kv_port = kv_port + logger.info(f"Allocated ZMQ KV events port: {kv_port} (worker_id={worker_id})") + + # Allocate side channel ports + # https://github.com/vllm-project/vllm/blob/releases/v0.10.0/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py#L372 + # NIXL calculates ports as: base_port + (dp_rank * tp_size) + tp_rank + # For dp_rank, we need to reserve tp_size consecutive ports + tp_size = config.engine_args.tensor_parallel_size or 1 + + # The first port for this dp_rank will be at: base_port + (dp_rank * tp_size) + # We need to allocate tp_size consecutive ports starting from there + nixl_metadata = PortMetadata(worker_id=worker_id, reason="nixl_side_channel_port") + nixl_request = PortAllocationRequest( + etcd_context=etcd_context, + metadata=nixl_metadata, + port_range=config.port_range, + block_size=tp_size, ) + allocated_ports = await allocate_and_reserve_port_block(nixl_request) + first_port_for_dp_rank = allocated_ports[0] + + # Calculate the base port that NIXL expects + # base_port = first_port_for_dp_rank - (dp_rank * tp_size) + nixl_offset = dp_rank * tp_size + base_side_channel_port = first_port_for_dp_rank - nixl_offset + + if base_side_channel_port < 0: + raise 
ValueError( + f"NIXL base port calculation resulted in negative port: " + f"first_allocated_port={first_port_for_dp_rank}, offset={nixl_offset}, " + f"base_port={base_side_channel_port}. Current range: {config.port_range.min}-{config.port_range.max}. " + f"Consider using a higher port range." + ) - # Allocate side channel port - side_channel_port = await allocate_and_reserve_port( - namespace=config.namespace, - etcd_client=etcd_client, - worker_id=f"{worker_id}", - reason="nixl_side_channel_port", - ) + config.side_channel_port = base_side_channel_port - # Update config with allocated ports - config.kv_port = kv_port - config.side_channel_port = side_channel_port + logger.info( + f"Allocated NIXL side channel ports: base={base_side_channel_port}, " + f"allocated_ports={allocated_ports} (worker_id={worker_id}, dp_rank={dp_rank}, tp_size={tp_size})" + ) def overwrite_args(config): """Set vLLM defaults for Dynamo.""" - assert ( - config.kv_port is not None - ), "Must set the kv_port, use configure_ports_with_etcd" assert ( config.side_channel_port is not None ), "Must set the kv_port, use configure_ports_with_etcd" @@ -263,36 +246,6 @@ def overwrite_args(config): raise ValueError(f"{key} not found in AsyncEngineArgs from vLLM.") -def get_host_ip() -> str: - """Get the IP address of the host. - This is needed for the side channel to work in multi-node deployments. - """ - try: - host_name = socket.gethostname() - except socket.error as e: - logger.warning(f"Failed to get hostname: {e}, falling back to '127.0.0.1'") - return "127.0.0.1" - else: - try: - # Get the IP address of the hostname - this is needed for the side channel to work in multi-node deployments - host_ip = socket.gethostbyname(host_name) - # Test if the IP is actually usable by binding to it - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as test_socket: - test_socket.bind((host_ip, 0)) - return host_ip - except socket.gaierror as e: - logger.warning( - f"Hostname '{host_name}' cannot be resolved: {e}, falling back to '127.0.0.1'" - ) - return "127.0.0.1" - except socket.error as e: - # If hostname is not usable for binding, fall back to localhost - logger.warning( - f"Hostname '{host_name}' is not usable for binding: {e}, falling back to '127.0.0.1'" - ) - return "127.0.0.1" - - def set_side_channel_host_and_port(config: Config): """vLLM V1 NixlConnector creates a side channel to exchange metadata with other NIXL connectors. This sets the port number for the side channel. diff --git a/components/backends/vllm/src/dynamo/vllm/ports.py b/components/backends/vllm/src/dynamo/vllm/ports.py new file mode 100644 index 0000000000..19fdde7279 --- /dev/null +++ b/components/backends/vllm/src/dynamo/vllm/ports.py @@ -0,0 +1,290 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Port allocation and management utilities for Dynamo services.""" + +import asyncio +import json +import logging +import os +import random +import socket +import time +from contextlib import contextmanager +from dataclasses import dataclass, field + +from dynamo.runtime import EtcdKvCache + +logger = logging.getLogger(__name__) + +# Default port range in the registered ports section +DEFAULT_DYNAMO_PORT_MIN = 20000 +DEFAULT_DYNAMO_PORT_MAX = 30000 + + +@dataclass +class DynamoPortRange: + """Port range configuration for Dynamo services""" + + min: int + max: int + + def __post_init__(self): + if self.min < 1024 or self.max > 49151: + raise ValueError( + f"Port range {self.min}-{self.max} is outside registered ports range (1024-49151)" + ) + if self.min >= self.max: + raise ValueError( + f"Invalid port range: min ({self.min}) must be less than max ({self.max})" + ) + + +@dataclass +class EtcdContext: + """Context for ETCD operations""" + + client: EtcdKvCache # etcd client instance + namespace: str # Namespace for keys (used in key prefix) + + def make_port_key(self, port: int) -> str: + """Generate ETCD key for a port reservation""" + node_ip = get_host_ip() + return f"dyn://{self.namespace}/ports/{node_ip}/{port}" + + +@dataclass +class PortMetadata: + """Metadata to store with port reservations in ETCD""" + + worker_id: str # Worker identifier (e.g., "vllm-backend-dp0") + reason: str # Purpose of the port (e.g., "nixl_side_channel_port") + block_info: dict = field(default_factory=dict) # Optional block allocation info + + def to_etcd_value(self) -> dict: + """Convert to dictionary for ETCD storage""" + value = { + "worker_id": self.worker_id, + "reason": self.reason, + "reserved_at": time.time(), + "pid": os.getpid(), + } + if self.block_info: + value.update(self.block_info) + return value + + +@dataclass +class PortAllocationRequest: + """Parameters for port allocation""" + + etcd_context: EtcdContext + metadata: PortMetadata + port_range: DynamoPortRange + block_size: int = 1 + max_attempts: int = 100 + + +@contextmanager +def hold_ports(ports: int | list[int]): + """Context manager to hold port binding(s). + + Holds socket bindings to ensure exclusive access to ports during reservation. + Can handle a single port or multiple ports. + + Args: + ports: Single port number or list of port numbers to hold + """ + if isinstance(ports, int): + ports = [ports] + + sockets = [] + try: + for port in ports: + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + sock.bind(("", port)) + sockets.append(sock) + + yield + + finally: + for sock in sockets: + sock.close() + + +def check_port_available(port: int) -> bool: + """Check if a specific port is available for binding.""" + try: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + sock.bind(("", port)) + return True + except OSError: + return False + + +async def reserve_port_in_etcd( + etcd_context: EtcdContext, + port: int, + metadata: PortMetadata, +) -> None: + """Reserve a single port in ETCD.""" + key = etcd_context.make_port_key(port) + value = metadata.to_etcd_value() + + await etcd_context.client.kv_create( + key=key, + value=json.dumps(value).encode(), + lease_id=etcd_context.client.primary_lease_id(), + ) + + +async def allocate_and_reserve_port_block(request: PortAllocationRequest) -> list[int]: + """ + Allocate a contiguous block of ports from the specified range and atomically reserve them in ETCD. 
+ Returns a list of all allocated ports in order. + + This function uses a context manager to hold port bindings while reserving in ETCD, + preventing race conditions between multiple processes. + + Args: + request: PortAllocationRequest containing all allocation parameters + + Returns: + list[int]: List of all allocated ports in ascending order + + Raises: + RuntimeError: If unable to reserve a port block within max_attempts + OSError: If unable to create sockets (system resource issues) + """ + # Create a list of valid starting ports (must have room for the entire block) + max_start_port = request.port_range.max - request.block_size + 1 + if max_start_port < request.port_range.min: + raise ValueError( + f"Port range {request.port_range.min}-{request.port_range.max} is too small for block size {request.block_size}" + ) + + available_start_ports = list(range(request.port_range.min, max_start_port + 1)) + random.shuffle(available_start_ports) + + actual_max_attempts = min(len(available_start_ports), request.max_attempts) + + for attempt in range(1, actual_max_attempts + 1): + start_port = available_start_ports[attempt - 1] + ports_to_reserve = list(range(start_port, start_port + request.block_size)) + + try: + # Try to bind to all ports in the block atomically + with hold_ports(ports_to_reserve): + logger.debug( + f"Successfully bound to ports {ports_to_reserve}, now reserving in ETCD" + ) + + # We have exclusive access to these ports, now reserve them in ETCD + for i, port in enumerate(ports_to_reserve): + port_metadata = PortMetadata( + worker_id=f"{request.metadata.worker_id}-{i}" + if request.block_size > 1 + else request.metadata.worker_id, + reason=request.metadata.reason, + block_info={ + "block_index": i, + "block_size": request.block_size, + "block_start": start_port, + } + if request.block_size > 1 + else {}, + ) + + await reserve_port_in_etcd( + etcd_context=request.etcd_context, + port=port, + metadata=port_metadata, + ) + + logger.debug( + f"Reserved port block {ports_to_reserve} from range {request.port_range.min}-{request.port_range.max} " + f"for {request.metadata.worker_id} (block_size={request.block_size})" + ) + return ports_to_reserve + + except OSError as e: + logger.debug( + f"Failed to bind to port block starting at {start_port} (attempt {attempt}): {e}" + ) + except Exception as e: + logger.debug( + f"Failed to reserve port block starting at {start_port} in ETCD (attempt {attempt}): {e}" + ) + + if attempt < actual_max_attempts: + await asyncio.sleep(0.01) + + raise RuntimeError( + f"Failed to allocate and reserve a port block of size {request.block_size} from range " + f"{request.port_range.min}-{request.port_range.max} after {actual_max_attempts} attempts" + ) + + +async def allocate_and_reserve_port( + etcd_context: EtcdContext, + metadata: PortMetadata, + port_range: DynamoPortRange, + max_attempts: int = 100, +) -> int: + """ + Allocate a port from the specified range and atomically reserve it in ETCD. + This is a convenience wrapper around allocate_and_reserve_port_block with block_size=1. 
+ + Args: + etcd_context: ETCD context for operations + metadata: Port metadata for ETCD storage + port_range: DynamoPortRange object specifying min and max ports to try + max_attempts: Maximum number of ports to try (default: 100) + + Returns: + int: The allocated port number + + Raises: + RuntimeError: If unable to reserve a port within max_attempts + OSError: If unable to create sockets (system resource issues) + """ + request = PortAllocationRequest( + etcd_context=etcd_context, + metadata=metadata, + port_range=port_range, + block_size=1, + max_attempts=max_attempts, + ) + allocated_ports = await allocate_and_reserve_port_block(request) + return allocated_ports[0] # Return the single allocated port + + +def get_host_ip() -> str: + """Get the IP address of the host. + This is needed for the side channel to work in multi-node deployments. + """ + try: + host_name = socket.gethostname() + except socket.error as e: + logger.warning(f"Failed to get hostname: {e}, falling back to '127.0.0.1'") + return "127.0.0.1" + else: + try: + # Get the IP address of the hostname - this is needed for the side channel to work in multi-node deployments + host_ip = socket.gethostbyname(host_name) + # Test if the IP is actually usable by binding to it + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as test_socket: + test_socket.bind((host_ip, 0)) + return host_ip + except socket.gaierror as e: + logger.warning( + f"Hostname '{host_name}' cannot be resolved: {e}, falling back to '127.0.0.1'" + ) + return "127.0.0.1" + except socket.error as e: + # If hostname is not usable for binding, fall back to localhost + logger.warning( + f"Hostname '{host_name}' is not usable for binding: {e}, falling back to '127.0.0.1'" + ) + return "127.0.0.1" diff --git a/container/Dockerfile b/container/Dockerfile new file mode 100644 index 0000000000..e2ba11a4ef --- /dev/null +++ b/container/Dockerfile @@ -0,0 +1,280 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +ARG BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" +# TODO OPS-612: NCCL will hang with 25.03, so use 25.01 for now +# Please check https://github.com/ai-dynamo/dynamo/pull/1065 +# for details and reproducer to manually test if the image +# can be updated to later versions. +ARG BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04" +ARG RELEASE_BUILD=false +ARG ENABLE_KVBM=false + +# Define general architecture ARGs for supporting both x86 and aarch64 builds. 
+# ARCH: Used for package suffixes (e.g., amd64, arm64) +# ARCH_ALT: Used for Rust targets, manylinux suffix (e.g., x86_64, aarch64) +# +# Default values are for x86/amd64: +# --build-arg ARCH=amd64 --build-arg ARCH_ALT=x86_64 +# +# For arm64/aarch64, build with: +# --build-arg ARCH=arm64 --build-arg ARCH_ALT=aarch64 +#TODO OPS-592: Leverage uname -m to determine ARCH instead of passing it as an arg +ARG ARCH=amd64 +ARG ARCH_ALT=x86_64 + + +################################## +########## Base Image ############ +################################## + +FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS base + +# Redeclare ARCH and ARCH_ALT so they're available in this stage +ARG ARCH +ARG ARCH_ALT +ARG CARGO_BUILD_JOBS + +ARG NIXL_UCX_REF=v1.19.x +ARG NIXL_REF=0.4.1 + +# Environment variables for NIXL +ENV NIXL_SRC_DIR=/opt/nixl \ + NIXL_PREFIX=/opt/nvidia/nvda_nixl \ + NIXL_LIB_DIR=/opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu \ + NIXL_PLUGIN_DIR=/opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu/plugins + +USER root +ARG PYTHON_VERSION=3.12 + +COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ + +# Rust environment setup +ENV RUSTUP_HOME=/usr/local/rustup \ + CARGO_HOME=/usr/local/cargo \ + PATH=/usr/local/cargo/bin:$PATH \ + RUST_VERSION=1.87.0 + +WORKDIR /opt/dynamo + +# Define Rust target based on ARCH_ALT ARG +ARG RUSTARCH=${ARCH_ALT}-unknown-linux-gnu + +# Install Rust using RUSTARCH derived from ARCH_ALT +RUN wget --tries=3 --waitretry=5 "https://static.rust-lang.org/rustup/archive/1.28.1/${RUSTARCH}/rustup-init" && \ + # TODO OPS-591: Add SHA check back based on RUSTARCH + chmod +x rustup-init && \ + ./rustup-init -y --no-modify-path --profile minimal --default-toolchain $RUST_VERSION --default-host ${RUSTARCH} && \ + rm rustup-init && \ + chmod -R a+w $RUSTUP_HOME $CARGO_HOME + +RUN apt-get update -y \ + && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + # NIXL build dependencies + autoconf \ + automake \ + cmake \ + git \ + libtool \ + meson \ + net-tools \ + ninja-build \ + pybind11-dev \ + # These headers are missing with the hpcx installer, required + # by UCX to find RDMA devices + ibverbs-providers \ + ibverbs-utils \ + libibumad-dev \ + libibverbs-dev \ + librdmacm-dev \ + libnuma-dev \ + rdma-core \ + # Rust build dependencies + clang \ + libclang-dev \ + protobuf-compiler \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Download external dependencies in parallel for better performance +ENV NATS_VERSION="v2.10.28" +RUN --mount=type=cache,target=/var/cache/apt \ + wget --tries=3 --waitretry=5 https://github.com/nats-io/nats-server/releases/download/${NATS_VERSION}/nats-server-${NATS_VERSION}-${ARCH}.deb && \ + dpkg -i nats-server-${NATS_VERSION}-${ARCH}.deb && rm nats-server-${NATS_VERSION}-${ARCH}.deb + +ENV ETCD_VERSION="v3.5.21" +RUN wget --tries=3 --waitretry=5 https://github.com/etcd-io/etcd/releases/download/$ETCD_VERSION/etcd-$ETCD_VERSION-linux-${ARCH}.tar.gz -O /tmp/etcd.tar.gz && \ + mkdir -p /usr/local/bin/etcd && \ + tar -xvf /tmp/etcd.tar.gz -C /usr/local/bin/etcd --strip-components=1 && \ + rm /tmp/etcd.tar.gz +ENV PATH=/usr/local/bin/etcd/:$PATH + +### UCX EFA Setup ### +RUN rm -rf /opt/hpcx/ucx && \ + rm -rf /usr/local/ucx && \ + echo "Building UCX with reference $NIXL_UCX_REF" && \ + cd /usr/local/src && \ + git clone --depth 1 --branch $NIXL_UCX_REF https://github.com/openucx/ucx.git && \ + cd ucx && \ + ./autogen.sh && \ + ./configure \ + --prefix=/usr/local/ucx \ + --enable-shared \ + --disable-static \ + 
--disable-doxygen-doc \ + --enable-optimizations \ + --enable-cma \ + --enable-devel-headers \ + --with-cuda=/usr/local/cuda \ + --with-verbs \ + --with-efa \ + --with-dm \ + --with-gdrcopy=/usr/local \ + --enable-mt && \ + make -j$(nproc) && \ + make -j$(nproc) install-strip && \ + echo "/usr/local/ucx/lib" > /etc/ld.so.conf.d/ucx.conf && \ + echo "/usr/local/ucx/lib/ucx" >> /etc/ld.so.conf.d/ucx.conf && \ + ldconfig && \ + cd /usr/local/src && \ + rm -rf ucx + +# UCX environment variables +ENV CPATH=/usr/include:$CPATH \ + PATH=/usr/bin:$PATH \ + PKG_CONFIG_PATH=/usr/lib/pkgconfig:$PKG_CONFIG_PATH + +### NIXL SETUP ### +# Clone nixl source with shallow clone for faster download +RUN git clone --depth 1 --branch ${NIXL_REF} "https://github.com/ai-dynamo/nixl.git" ${NIXL_SRC_DIR} && \ + cd ${NIXL_SRC_DIR} && \ + if [ "$ARCH" = "arm64" ]; then \ + nixl_build_args="-Ddisable_gds_backend=true"; \ + else \ + nixl_build_args=""; \ + fi && \ + meson setup build/ --buildtype=release --prefix=$NIXL_PREFIX $nixl_build_args && \ + ninja -C build/ -j$(nproc) && \ + ninja -C build/ install && \ + echo "$NIXL_LIB_DIR" > /etc/ld.so.conf.d/nixl.conf && \ + echo "$NIXL_PLUGIN_DIR" >> /etc/ld.so.conf.d/nixl.conf && \ + ldconfig + +# Install NIXL Python module +# TODO OPS-590: Move gds_path selection based on arch into NIXL build and re-enable gds backend for arm64 +RUN if [ "$ARCH" = "arm64" ]; then \ + cd ${NIXL_SRC_DIR} && uv build . --out-dir /opt/dynamo/wheelhouse/nixl \ + --config-settings=setup-args="-Ddisable_gds_backend=true"; \ + else \ + cd ${NIXL_SRC_DIR} && uv build . --out-dir /opt/dynamo/wheelhouse/nixl; \ + fi + +# Create virtual environment +RUN mkdir -p /opt/dynamo/venv && \ + uv venv /opt/dynamo/venv --python 3.12 + +# Activate virtual environment +ENV VIRTUAL_ENV=/opt/dynamo/venv \ + PATH="/opt/dynamo/venv/bin:${PATH}" + +# Install common and test dependencies +RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \ + --mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.test.txt \ + uv pip install --requirement /tmp/requirements.txt --requirement /tmp/requirements.test.txt + +################################## +##### Wheel Build Image ########## +################################## + +# Redeclare ARCH_ALT ARG so it's available for interpolation in the FROM instruction +ARG ARCH_ALT + +FROM quay.io/pypa/manylinux_2_28_${ARCH_ALT} AS wheel_builder + +ARG CARGO_BUILD_JOBS +# Set CARGO_BUILD_JOBS to 16 if not provided +# This is to prevent cargo from building $(nproc) jobs in parallel, +# which might exceed the number of opened files limit. +ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16} +# Use build arg RELEASE_BUILD = true to generate wheels for Python 3.10, 3.11 and 3.12. 
+ARG RELEASE_BUILD +# Use arg ENABLE_KVBM = true to turn on the block-manager feature +ARG ENABLE_KVBM + +WORKDIR /opt/dynamo + +RUN dnf update -y \ + && dnf install -y llvm-toolset protobuf-compiler python3.12-devel \ + && dnf clean all \ + && rm -rf /var/cache/dnf + +ENV RUSTUP_HOME=/usr/local/rustup \ + CARGO_HOME=/usr/local/cargo \ + CARGO_TARGET_DIR=/opt/dynamo/target \ + VIRTUAL_ENV=/opt/dynamo/venv \ + NIXL_PREFIX=/opt/nvidia/nvda_nixl + +COPY --from=base $RUSTUP_HOME $RUSTUP_HOME +COPY --from=base $CARGO_HOME $CARGO_HOME +COPY --from=base $NIXL_PREFIX $NIXL_PREFIX +COPY --from=base $VIRTUAL_ENV $VIRTUAL_ENV +ENV PATH=$CARGO_HOME/bin:$VIRTUAL_ENV/bin:$PATH + +# Copy configuration files first for better layer caching +COPY pyproject.toml README.md LICENSE Cargo.toml Cargo.lock rust-toolchain.toml /opt/dynamo/ + +# Copy source code +COPY lib/ /opt/dynamo/lib/ +COPY components/ /opt/dynamo/components/ + +# Build dynamo wheel +RUN uv build --wheel --out-dir /opt/dynamo/dist && \ + cd /opt/dynamo/lib/bindings/python && \ + uv pip install maturin[patchelf] && \ + if [ "$ENABLE_KVBM" = "true" ]; then \ + maturin build --release --features block-manager --out /opt/dynamo/dist; \ + else \ + maturin build --release --out /opt/dynamo/dist; \ + fi && \ + if [ "$RELEASE_BUILD" = "true" ]; then \ + # do not enable KVBM feature, ensure compatibility with lower glibc + uv run --python 3.11 maturin build --release --out /opt/dynamo/dist && \ + uv run --python 3.10 maturin build --release --out /opt/dynamo/dist; \ + fi + +############################################## +########## Dev entrypoint image ############## +############################################## +FROM base AS dev + +# Application environment variables +ENV DYNAMO_HOME=/opt/dynamo \ + CARGO_TARGET_DIR=/opt/dynamo/target \ + PYTHONPATH=/opt/dynamo:$PYTHONPATH + +WORKDIR /opt/dynamo + +COPY --from=wheel_builder /opt/dynamo/dist/*.whl /opt/dynamo/wheelhouse/ +COPY --from=wheel_builder $CARGO_TARGET_DIR $CARGO_TARGET_DIR + +# Copy Cargo cache to avoid re-downloading dependencies +COPY --from=wheel_builder $CARGO_HOME $CARGO_HOME + +# Temporarily copy benchmarks folder for installation +COPY benchmarks/ /opt/dynamo/benchmarks/ + +# Install all python packages +RUN uv pip install \ + /opt/dynamo/wheelhouse/ai_dynamo_runtime*cp312*.whl \ + /opt/dynamo/wheelhouse/ai_dynamo*any.whl \ + /opt/dynamo/wheelhouse/nixl/nixl*.whl \ + /opt/dynamo/benchmarks && \ + rm -rf /opt/dynamo/benchmarks + +# Copy launch banner +RUN --mount=type=bind,source=./container/launch_message.txt,target=/opt/dynamo/launch_message.txt \ + sed '/^#\s/d' /opt/dynamo/launch_message.txt > ~/.launch_screen && \ + echo "cat ~/.launch_screen" >> ~/.bashrc + +ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"] +CMD [] diff --git a/container/Dockerfile.none b/container/Dockerfile.none deleted file mode 100644 index 6ae8e9b937..0000000000 --- a/container/Dockerfile.none +++ /dev/null @@ -1,58 +0,0 @@ -FROM ubuntu:24.04 AS dev - -# libclang-dev && git needed for llamacpp engine deps in dynamo-run build -RUN apt-get update && \ - DEBIAN_FRONTEND=noninteractive apt-get install -yq python3-dev python3-pip python3-venv libucx0 libclang-dev git - -COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ -RUN mkdir /opt/dynamo && \ - uv venv /opt/dynamo/venv --python 3.12 && \ - . 
/opt/dynamo/venv/bin/activate && \ - uv pip install pip - -ENV VIRTUAL_ENV=/opt/dynamo/venv -ENV PATH="${VIRTUAL_ENV}/bin:${PATH}" - -# Rust build/dev dependencies -RUN apt update -y && \ - apt install --no-install-recommends -y \ - wget \ - build-essential \ - protobuf-compiler \ - cmake \ - libssl-dev \ - pkg-config - -ENV RUSTUP_HOME=/usr/local/rustup \ - CARGO_HOME=/usr/local/cargo \ - PATH=/usr/local/cargo/bin:$PATH \ - RUST_VERSION=1.87.0 \ - RUSTARCH=x86_64-unknown-linux-gnu - -RUN wget --tries=3 --waitretry=5 "https://static.rust-lang.org/rustup/archive/1.28.1/${RUSTARCH}/rustup-init" && \ - echo "a3339fb004c3d0bb9862ba0bce001861fe5cbde9c10d16591eb3f39ee6cd3e7f *rustup-init" | sha256sum -c - && \ - chmod +x rustup-init && \ - ./rustup-init -y --no-modify-path --profile default --default-toolchain $RUST_VERSION --default-host ${RUSTARCH} && \ - rm rustup-init && \ - chmod -R a+w $RUSTUP_HOME $CARGO_HOME - - -WORKDIR /workspace -ENV DYNAMO_HOME=/workspace - -COPY . /workspace/ - -ARG CARGO_BUILD_JOBS - -ENV CARGO_TARGET_DIR=/workspace/target - -RUN cargo build --release --locked && \ - cargo doc --no-deps && \ - cp target/release/dynamo-run /usr/local/bin && \ - cp target/release/metrics /usr/local/bin && \ - cp target/release/mock_worker /usr/local/bin - - -RUN uv build --wheel --out-dir /workspace/dist && \ - uv pip install /workspace/dist/ai_dynamo*any.whl - diff --git a/container/Dockerfile.sglang b/container/Dockerfile.sglang index 8557684096..a78ecb35e4 100644 --- a/container/Dockerfile.sglang +++ b/container/Dockerfile.sglang @@ -40,7 +40,7 @@ ARG ARCH ARG ARCH_ALT ARG NIXL_UCX_REF=v1.19.x -ARG NIXL_REF=3c47a48955e6f96bd5d4fb43a9d80bb64722f8e4 +ARG NIXL_REF=0.4.1 ENV NIXL_SRC_DIR=/opt/nixl ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl @@ -378,8 +378,6 @@ RUN --mount=type=bind,source=./container/launch_message.txt,target=/workspace/la sed '/^#\s/d' /workspace/launch_message.txt > ~/.launch_screen && \ echo "cat ~/.launch_screen" >> ~/.bashrc -ENV PYTHONPATH=/workspace/dynamo/components/planner/src:/workspace/examples/sglang/utils:$PYTHONPATH - ######################################## ########## Development Image ########### ######################################## @@ -416,6 +414,9 @@ ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins COPY --from=base /usr/local/ucx /usr/local/ucx COPY --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX +# Copy CUDA development tools (nvcc, headers, etc.) from base devel image +COPY --from=base /usr/local/cuda/ /usr/local/cuda/ + ENV LD_LIBRARY_PATH=\ $NIXL_LIB_DIR:\ $NIXL_PLUGIN_DIR:\ @@ -429,8 +430,9 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ RUN apt-get update && \ DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ build-essential python3-dev libnuma-dev \ - # Curl for polling various endpoints. 
+ # jq and curl for polling various endpoints and health checks curl \ + jq \ # For debugging vim \ # Libraries required by UCX to find RDMA devices @@ -446,7 +448,20 @@ RUN apt-get update && \ COPY --from=ci_minimum /workspace/target/release/metrics /usr/local/bin/metrics COPY --from=wheel_builder /workspace/dist/*.whl wheelhouse/ COPY --from=base /workspace/wheels/nixl/*.whl wheelhouse/ -RUN uv pip install ai-dynamo[sglang] --find-links wheelhouse + +# Install flashinfer-python pre-release version separately, then install ai-dynamo with sglang support +RUN uv pip install "flashinfer-python==0.2.9rc2" --prerelease=allow && \ + uv pip install "ai-dynamo[sglang]" --find-links wheelhouse + +# Common dependencies +# TODO: Remove extra install and use pyproject.toml to define all dependencies +RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \ + uv pip install --requirement /tmp/requirements.txt + +# Install test dependencies +# TODO: Remove this once we have a functional CI image built on top of the runtime image +RUN --mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.txt \ + uv pip install --requirement /tmp/requirements.txt # Copy launch banner RUN --mount=type=bind,source=./container/launch_message.txt,target=/workspace/launch_message.txt \ @@ -466,7 +481,5 @@ RUN uv pip install /workspace/benchmarks # Copy attribution files COPY ATTRIBUTION* LICENSE /workspace/ -ENV PYTHONPATH=/workspace/examples/sglang/utils:$PYTHONPATH - ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"] CMD [] diff --git a/container/Dockerfile.sglang-wideep b/container/Dockerfile.sglang-wideep index 0bbcb3af23..68cea2a559 100644 --- a/container/Dockerfile.sglang-wideep +++ b/container/Dockerfile.sglang-wideep @@ -71,7 +71,7 @@ RUN rm -rf /opt/hpcx/ucx && \ ENV LD_LIBRARY_PATH=/usr/lib:/usr/local/ucx/lib:$LD_LIBRARY_PATH -ARG NIXL_TAG=0.3.1 +ARG NIXL_TAG=0.4.1 RUN git clone https://github.com/ai-dynamo/nixl.git && cd nixl && git checkout ${NIXL_TAG} && pip install --break-system-packages . --config-settings=setup-args="-Ducx_path=/usr/local/ucx" WORKDIR /sgl-workspace @@ -121,7 +121,7 @@ ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16} RUN cargo build --release RUN cd lib/bindings/python && pip install --break-system-packages -e . && cd ../../.. -RUN pip install --break-system-packages -e . +RUN pip install --break-system-packages . RUN wget --tries=3 --waitretry=5 https://github.com/nats-io/nats-server/releases/download/v2.10.28/nats-server-v2.10.28-${ARCH}.deb && \ dpkg -i nats-server-v2.10.28-${ARCH}.deb && rm nats-server-v2.10.28-${ARCH}.deb @@ -152,6 +152,9 @@ RUN cmake --version RUN apt-get update -y && \ apt-get install -y --no-install-recommends \ rapidjson-dev \ + # jq and curl for polling various endpoints and health checks + jq \ + curl \ zlib1g-dev RUN git clone --depth=1 https://github.com/triton-inference-server/perf_analyzer.git && \ diff --git a/container/Dockerfile.tensorrt_llm b/container/Dockerfile.tensorrt_llm index 4a6cd167bf..57ee44a0e6 100644 --- a/container/Dockerfile.tensorrt_llm +++ b/container/Dockerfile.tensorrt_llm @@ -45,7 +45,7 @@ ARG ARCH ARG ARCH_ALT ARG NIXL_UCX_REF=v1.19.x -ARG NIXL_REF=3c47a48955e6f96bd5d4fb43a9d80bb64722f8e4 +ARG NIXL_REF=0.4.1 ENV NIXL_SRC_DIR=/opt/nixl ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl @@ -150,6 +150,7 @@ COPY --from=trtllm_wheel . 
/trtllm_wheel/ # Note: TensorRT needs to be uninstalled before installing the TRTLLM wheel # because there might be mismatched versions of TensorRT between the NGC PyTorch # and the TRTLLM wheel. +# Locking triton version to 3.3.1 as 3.4.0 breaks tensorrt-llm 1.0.0rc4 RUN [ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true && \ pip uninstall -y tensorrt && \ if [ "$HAS_TRTLLM_CONTEXT" = "1" ]; then \ @@ -157,14 +158,19 @@ RUN [ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true && \ WHEEL_FILE=$(find /trtllm_wheel -name "*.whl" | head -n 1); \ if [ -n "$WHEEL_FILE" ]; then \ pip install "$WHEEL_FILE"; \ + if [ "$ARCH" = "amd64" ]; then \ + pip install "triton==3.3.1"; \ + fi; \ else \ echo "No wheel file found in /trtllm_wheel directory."; \ exit 1; \ fi; \ else \ - # Install TensorRT-LLM wheel from the provided index URL, allow dependencies from PyPI - pip install --extra-index-url "${TENSORRTLLM_INDEX_URL}" \ - "${TENSORRTLLM_PIP_WHEEL}" ; \ + # Install TensorRT-LLM wheel from the provided index URL, allow dependencies from PyPI + pip install --extra-index-url "${TENSORRTLLM_INDEX_URL}" "${TENSORRTLLM_PIP_WHEEL}"; \ + if [ "$ARCH" = "amd64" ]; then \ + pip install "triton==3.3.1"; \ + fi; \ fi # Install test dependencies @@ -367,12 +373,25 @@ CMD [] FROM ${RUNTIME_IMAGE}:${RUNTIME_IMAGE_TAG} AS runtime +WORKDIR /workspace + ARG ARCH_ALT -WORKDIR /workspace ENV DYNAMO_HOME=/workspace ENV VIRTUAL_ENV=/opt/dynamo/venv ENV PATH="${VIRTUAL_ENV}/bin:${PATH}" +ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl +ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu +ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins +ENV LD_LIBRARY_PATH=\ +$NIXL_LIB_DIR:\ +$NIXL_PLUGIN_DIR:\ +/usr/local/ucx/lib:\ +/usr/local/ucx/lib/ucx:\ +/opt/hpcx/ompi/lib:\ +$LD_LIBRARY_PATH +ENV PATH=/opt/hpcx/ompi/bin:/usr/local/bin/etcd/:/usr/local/cuda/nvvm/bin:$PATH +ENV OPAL_PREFIX=/opt/hpcx/ompi # Install apt dependencies # openssh-client, openssh-server are needed for OpenMPI @@ -380,7 +399,8 @@ RUN apt-get update && \ DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ build-essential \ python3-dev \ - # Curl for polling various endpoints. 
+ # jq and curl for polling various endpoints and health checks + jq \ curl \ # For debugging vim \ @@ -466,21 +486,6 @@ COPY --from=build /usr/local/lib/python3.12/dist-packages/flash_attn /usr/local/ COPY --from=build /usr/local/lib/python3.12/dist-packages/flash_attn-${FLASH_ATTN_VER}.dist-info /usr/local/lib/python3.12/dist-packages/flash_attn-${FLASH_ATTN_VER}.dist-info COPY --from=build /usr/local/lib/python3.12/dist-packages/flash_attn_2_cuda.cpython-312-*-linux-gnu.so /usr/local/lib/python3.12/dist-packages/ -# Setup environment variables -ARG ARCH_ALT -ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl -ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu -ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins - -ENV LD_LIBRARY_PATH=\ -$NIXL_LIB_DIR:\ -$NIXL_PLUGIN_DIR:\ -/usr/local/ucx/lib:\ -/usr/local/ucx/lib/ucx:\ -/opt/hpcx/ompi/lib:\ -$LD_LIBRARY_PATH -ENV PATH=/opt/hpcx/ompi/bin:/usr/local/bin/etcd/:/usr/local/cuda/nvvm/bin:$PATH -ENV OPAL_PREFIX=/opt/hpcx/ompi # Install TensorRT-LLM (same as in build stage) ARG HAS_TRTLLM_CONTEXT=0 @@ -489,16 +494,23 @@ ARG TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple" # Copy Dynamo wheels into wheelhouse # Copy metrics binary from wheel_builder image, not part of ai-dynamo wheel -COPY --from=dev /workspace/wheels/nixl/*.whl wheelhouse/ -COPY --from=wheel_builder /workspace/dist/*.whl wheelhouse/ +COPY --from=dev /workspace/wheels/nixl/*.whl /workspace/wheelhouse/ +COPY --from=wheel_builder /workspace/dist/*.whl /workspace/wheelhouse/ COPY --from=dev /workspace/target/release/metrics /usr/local/bin/metrics # NOTE: If a package (tensorrt_llm) exists on both --index-url and --extra-index-url, # uv will prioritize the --extra-index-url, unless --index-strategy unsafe-best-match # is also specified. So set the configurable index as a --extra-index-url for prioritization. -RUN uv pip install --extra-index-url "${TENSORRTLLM_INDEX_URL}" \ - "${TENSORRTLLM_PIP_WHEEL}" && \ - uv pip install ai-dynamo nixl --find-links wheelhouse +# NOTE: locking triton version to 3.3.1 as 3.4.0 breaks tensorrt-llm 1.0.0rc4 +# NOTE: locking cuda-python version to <13 to avoid breaks with tensorrt-llm 1.0.0rc4. This +# can be removed after https://github.com/NVIDIA/TensorRT-LLM/pull/6703 is merged +# we upgrade to a published pip wheel containing this change. +RUN uv pip install "cuda-python>=12,<13" && \ + uv pip install --extra-index-url "${TENSORRTLLM_INDEX_URL}" "${TENSORRTLLM_PIP_WHEEL}" && \ + if [ "$ARCH" = "amd64" ]; then \ + pip install "triton==3.3.1"; \ + fi; \ + uv pip install /workspace/wheelhouse/ai_dynamo_runtime*cp312*.whl /workspace/wheelhouse/ai_dynamo*any.whl /workspace/wheelhouse/nixl*.whl # Setup TRTLLM environment variables, same as in dev image ENV TRTLLM_USE_UCX_KVCACHE=1 diff --git a/container/Dockerfile.vllm b/container/Dockerfile.vllm index 1ad4d253e7..c50b6d3ee4 100644 --- a/container/Dockerfile.vllm +++ b/container/Dockerfile.vllm @@ -8,6 +8,7 @@ ARG BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" # can be updated to later versions. 
ARG BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04" ARG RELEASE_BUILD +ARG ENABLE_KVBM=false ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda" ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04" @@ -17,7 +18,9 @@ ARG TORCH_BACKEND="cu128" # Match 0.10.0 vLLM release # https://github.com/vllm-project/vllm/releases/tag/v0.10.0 -ARG DEEPGEMM_REF="1876566" +# Pinned to commit before https://github.com/deepseek-ai/DeepGEMM/pull/112 for DeepGEMM which seems to break on H100: +# "RuntimeError: Failed: CUDA runtime error csrc/jit/kernel_runtime.hpp:108 '98'" +ARG DEEPGEMM_REF="03d0be3" ARG FLASHINF_REF="v0.2.8rc1" # Define general architecture ARGs for supporting both x86 and aarch64 builds. @@ -81,7 +84,7 @@ RUN apt-get update -y && \ rm -rf /var/lib/apt/lists/* ARG NIXL_UCX_REF=v1.19.x -ARG NIXL_REF=3c47a48955e6f96bd5d4fb43a9d80bb64722f8e4 +ARG NIXL_REF=0.4.1 ENV NIXL_SRC_DIR=/opt/nixl ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl @@ -197,7 +200,7 @@ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \ # Should be able to select how you want your build to go cp /tmp/deps/vllm/install_vllm.sh /tmp/install_vllm.sh && \ chmod +x /tmp/install_vllm.sh && \ - /tmp/install_vllm.sh --editable --vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt --deepgemm-ref $DEEPGEMM_REF --flashinf-ref $FLASHINF_REF --torch-backend $TORCH_BACKEND; + /tmp/install_vllm.sh --no-editable --vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt --deepgemm-ref $DEEPGEMM_REF --flashinf-ref $FLASHINF_REF --torch-backend $TORCH_BACKEND; ENV LD_LIBRARY_PATH=\ /opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\ @@ -317,6 +320,8 @@ ARG CARGO_BUILD_JOBS ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16} # Use build arg RELEASE_BUILD = true to generate wheels for Python 3.10, 3.11 and 3.12. ARG RELEASE_BUILD +# Use arg ENABLE_KVBM = true to turn on the block-manager feature +ARG ENABLE_KVBM # Keep in sync with the base image. ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl @@ -365,7 +370,11 @@ RUN cargo build \ RUN uv build --wheel --out-dir /workspace/dist && \ cd /workspace/lib/bindings/python && \ uv pip install maturin[patchelf] && \ - maturin build --release --features block-manager --out /workspace/dist && \ + if [ "$ENABLE_KVBM" = "true" ]; then \ + maturin build --release --features block-manager --out /workspace/dist; \ + else \ + maturin build --release --out /workspace/dist; \ + fi && \ if [ "$RELEASE_BUILD" = "true" ]; then \ # do not enable KVBM feature, ensure compatibility with lower glibc uv run --python 3.11 maturin build --release --out /workspace/dist && \ @@ -437,7 +446,8 @@ RUN apt-get update && \ DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ build-essential \ python3-dev \ - # Curl for polling various endpoints. + # jq and curl for polling various endpoints and health checks + jq \ curl \ # For debugging vim \ diff --git a/container/build.sh b/container/build.sh index 03f79588c3..5fe7e47baa 100755 --- a/container/build.sh +++ b/container/build.sh @@ -88,7 +88,7 @@ TENSORRTLLM_PIP_WHEEL_DIR="/tmp/trtllm_wheel/" # TensorRT-LLM commit to use for building the trtllm wheel if not provided. # Important Note: This commit is not used in our CI pipeline. See the CI # variables to learn how to run a pipeline with a specific commit. 
-DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="137fe35539ea182f1495f5021bfda97c729e50c3" +DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="69e9f6d48944b2ae0124ff57aa59340aa4dfae15" TRTLLM_COMMIT="" TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="0" @@ -96,7 +96,7 @@ TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="0" TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple" # TODO: Remove the version specification from here and use the ai-dynamo[trtllm] package. # Need to update the Dockerfile.tensorrt_llm to use the ai-dynamo[trtllm] package. -DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.0.0rc0" +DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.0.0rc4" TENSORRTLLM_PIP_WHEEL="" @@ -107,13 +107,13 @@ VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" # can be updated to later versions. VLLM_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04" -NONE_BASE_IMAGE="ubuntu" -NONE_BASE_IMAGE_TAG="24.04" +NONE_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" +NONE_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04" SGLANG_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" SGLANG_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04" -NIXL_REF=3c47a48955e6f96bd5d4fb43a9d80bb64722f8e4 +NIXL_REF=0.4.1 NIXL_UCX_EFA_REF=7ec95b95e524a87e81cac92f5ca8523e3966b16b NO_CACHE="" @@ -265,6 +265,9 @@ get_options() { --release-build) RELEASE_BUILD=true ;; + --enable-kvbm) + ENABLE_KVBM=true + ;; --make-efa) NIXL_UCX_REF=$NIXL_UCX_EFA_REF ;; @@ -369,6 +372,7 @@ show_help() { echo " [--build-context name=path to add build context]" echo " [--release-build perform a release build]" echo " [--make-efa Enables EFA support for NIXL]" + echo " [--enable-kvbm Enables KVBM support in Python 3.12]" echo " [--trtllm-use-nixl-kvcache-experimental Enables NIXL KVCACHE experimental support for TensorRT-LLM]" exit 0 } @@ -389,8 +393,6 @@ ARCH="amd64" if [[ "$PLATFORM" == *"linux/arm64"* ]]; then ARCH="arm64" BUILD_ARGS+=" --build-arg ARCH=arm64 --build-arg ARCH_ALT=aarch64 " - # TEMP: Pin to nixl 0.3.1 for arm build, since 0.4.0 fails - NIXL_REF=3503658e71143b56f9d5b1b440d84a94b9c41af8 fi # Update DOCKERFILE if framework is VLLM @@ -399,7 +401,7 @@ if [[ $FRAMEWORK == "VLLM" ]]; then elif [[ $FRAMEWORK == "TENSORRTLLM" ]]; then DOCKERFILE=${SOURCE_DIR}/Dockerfile.tensorrt_llm elif [[ $FRAMEWORK == "NONE" ]]; then - DOCKERFILE=${SOURCE_DIR}/Dockerfile.none + DOCKERFILE=${SOURCE_DIR}/Dockerfile elif [[ $FRAMEWORK == "SGLANG" ]]; then DOCKERFILE=${SOURCE_DIR}/Dockerfile.sglang fi @@ -412,7 +414,6 @@ if [[ $TARGET == "local-dev" ]]; then fi # BUILD DEV IMAGE - BUILD_ARGS+=" --build-arg BASE_IMAGE=$BASE_IMAGE --build-arg BASE_IMAGE_TAG=$BASE_IMAGE_TAG --build-arg FRAMEWORK=$FRAMEWORK --build-arg ${FRAMEWORK}_FRAMEWORK=1 --build-arg VERSION=$VERSION --build-arg PYTHON_PACKAGE_VERSION=$PYTHON_PACKAGE_VERSION" if [ -n "${GITHUB_TOKEN}" ]; then @@ -518,6 +519,11 @@ if [ ! -z ${RELEASE_BUILD} ]; then BUILD_ARGS+=" --build-arg RELEASE_BUILD=${RELEASE_BUILD} " fi +if [ ! -z ${ENABLE_KVBM} ]; then + echo "Enabling the KVBM in the ai-dynamo-runtime" + BUILD_ARGS+=" --build-arg ENABLE_KVBM=${ENABLE_KVBM} " +fi + if [ -n "${NIXL_UCX_REF}" ]; then BUILD_ARGS+=" --build-arg NIXL_UCX_REF=${NIXL_UCX_REF} " fi diff --git a/container/deps/vllm/install_vllm.sh b/container/deps/vllm/install_vllm.sh index b4a1fc9955..c47504d3ae 100755 --- a/container/deps/vllm/install_vllm.sh +++ b/container/deps/vllm/install_vllm.sh @@ -135,10 +135,20 @@ if [ "$ARCH" = "arm64" ]; then fi else echo "Installing vllm for AMD64 architecture" + + echo "Attempting to install pinned OpenAI version..." + if ! 
uv pip install openai==1.99.9; then + echo "Pinned versions failed" + exit 1 + fi + + uv pip install -r requirements/build.txt + export VLLM_PRECOMPILED_WHEEL_LOCATION=https://vllm-wheels.s3.us-west-2.amazonaws.com/0.10.0/vllm-0.10.0-cp38-abi3-manylinux1_x86_64.whl + if [ "$EDITABLE" = "true" ]; then - VLLM_USE_PRECOMPILED=1 uv pip install -e . --torch-backend=$TORCH_BACKEND + uv pip install -e . --torch-backend=$TORCH_BACKEND else - VLLM_USE_PRECOMPILED=1 uv pip install . --torch-backend=$TORCH_BACKEND + uv pip install . --torch-backend=$TORCH_BACKEND fi fi @@ -164,14 +174,17 @@ python setup.py install # Install Flash Infer -if [ "$ARCH" = "arm64" ]; then - uv pip install flashinfer-python -else - cd $INSTALLATION_DIR - git clone https://github.com/flashinfer-ai/flashinfer.git --recursive - cd flashinfer - git checkout $FLASHINF_REF - python -m pip install -v . +cd $INSTALLATION_DIR +git clone https://github.com/flashinfer-ai/flashinfer.git --recursive +cd flashinfer +git checkout $FLASHINF_REF +uv pip install -v . + +if [ "$ARCH" = "amd64" ]; then + # NOTE: PyTorch 2.8.0 compatibility issue + # PyTorch 2.8.0 causes "RuntimeError: operator torchvision::nms does not exist" error. + # Temporarily pinning to PyTorch 2.7.1 until this compatibility issue is resolved. + uv pip install torch==2.7.1 --index-url https://download.pytorch.org/whl/cu128 fi echo "vllm installation completed successfully" diff --git a/deploy/cloud/README.md b/deploy/cloud/README.md index 0f4ad5635e..dfbb10f392 100644 --- a/deploy/cloud/README.md +++ b/deploy/cloud/README.md @@ -21,6 +21,6 @@ This directory contains the infrastructure components required for the Dynamo cl For detailed documentation on setting up and using the Dynamo Cloud Platform, please refer to: - [Dynamo Cloud Platform Guide](../../docs/guides/dynamo_deploy/dynamo_cloud.md) -- [Operator Deployment Guide](../../docs/guides/dynamo_deploy/operator_deployment.md) +- [Operator Deployment Guide](../../docs/guides/dynamo_deploy/dynamo_operator.md) -For a quick start example, see [examples/hello_world/README.md#deploying-to-kubernetes-using-dynamo-cloud-and-dynamo-deploy-cli](../../examples/hello_world/README.md#deploying-to-kubernetes-using-dynamo-cloud-and-dynamo-deploy-cli) \ No newline at end of file +For a quick start example, see [examples/runtime/hello_world/README.md#deployment-to-kubernetes](../../examples/runtime/hello_world/README.md#deployment-to-kubernetes) \ No newline at end of file diff --git a/deploy/cloud/helm/deploy.sh b/deploy/cloud/helm/deploy.sh index 1866be0481..e9b9225c81 100755 --- a/deploy/cloud/helm/deploy.sh +++ b/deploy/cloud/helm/deploy.sh @@ -49,7 +49,6 @@ export ISTIO_GATEWAY="${ISTIO_GATEWAY:=istio-system/istio-ingressgateway}" export INGRESS_CLASS="${INGRESS_CLASS:=nginx}" export VIRTUAL_SERVICE_SUPPORTS_HTTPS="${VIRTUAL_SERVICE_SUPPORTS_HTTPS:=false}" export ENABLE_LWS="${ENABLE_LWS:=false}" -export ENABLE_GROVE="${ENABLE_GROVE:=false}" # Add command line options INTERACTIVE=false @@ -165,7 +164,7 @@ echo "DYNAMO_INGRESS_SUFFIX: $DYNAMO_INGRESS_SUFFIX" echo "VIRTUAL_SERVICE_SUPPORTS_HTTPS: $VIRTUAL_SERVICE_SUPPORTS_HTTPS" echo "INSTALL_CRDS: $INSTALL_CRDS" -envsubst '${NAMESPACE} ${RELEASE_NAME} ${DOCKER_USERNAME} ${DOCKER_PASSWORD} ${DOCKER_SERVER} ${IMAGE_TAG} ${DYNAMO_INGRESS_SUFFIX} ${PIPELINES_DOCKER_SERVER} ${PIPELINES_DOCKER_USERNAME} ${PIPELINES_DOCKER_PASSWORD} ${DOCKER_SECRET_NAME} ${INGRESS_ENABLED} ${ISTIO_ENABLED} ${INGRESS_CLASS} ${ISTIO_GATEWAY} ${VIRTUAL_SERVICE_SUPPORTS_HTTPS} ${ENABLE_LWS} 
${ENABLE_GROVE}' < dynamo-platform-values.yaml > generated-values.yaml +envsubst '${NAMESPACE} ${RELEASE_NAME} ${DOCKER_USERNAME} ${DOCKER_PASSWORD} ${DOCKER_SERVER} ${IMAGE_TAG} ${DYNAMO_INGRESS_SUFFIX} ${PIPELINES_DOCKER_SERVER} ${PIPELINES_DOCKER_USERNAME} ${PIPELINES_DOCKER_PASSWORD} ${DOCKER_SECRET_NAME} ${INGRESS_ENABLED} ${ISTIO_ENABLED} ${INGRESS_CLASS} ${ISTIO_GATEWAY} ${VIRTUAL_SERVICE_SUPPORTS_HTTPS} ${ENABLE_LWS}' < dynamo-platform-values.yaml > generated-values.yaml echo "generated file contents:" cat generated-values.yaml diff --git a/deploy/cloud/helm/dynamo-platform-values.yaml b/deploy/cloud/helm/dynamo-platform-values.yaml index 269a5962c1..4ead5fd98b 100644 --- a/deploy/cloud/helm/dynamo-platform-values.yaml +++ b/deploy/cloud/helm/dynamo-platform-values.yaml @@ -24,7 +24,6 @@ dynamo-operator: dynamo: enableLWS: ${ENABLE_LWS} - enableGrove: ${ENABLE_GROVE} ingress: enabled: ${INGRESS_ENABLED} className: ${INGRESS_CLASS} diff --git a/deploy/cloud/helm/platform/components/operator/templates/deployment.yaml b/deploy/cloud/helm/platform/components/operator/templates/deployment.yaml index bb570f2e78..6c6fe1abba 100644 --- a/deploy/cloud/helm/platform/components/operator/templates/deployment.yaml +++ b/deploy/cloud/helm/platform/components/operator/templates/deployment.yaml @@ -100,8 +100,8 @@ spec: {{- if .Values.dynamo.enableLWS }} - --enable-lws {{- end }} - {{- if .Values.dynamo.enableGrove }} - - --enable-grove + {{- if .Values.dynamo.groveTerminationDelay }} + - --grove-termination-delay={{ .Values.dynamo.groveTerminationDelay }} {{- end }} command: - /manager diff --git a/deploy/cloud/helm/platform/components/operator/templates/manager-rbac.yaml b/deploy/cloud/helm/platform/components/operator/templates/manager-rbac.yaml index f245ce4b68..bf084e5a1b 100644 --- a/deploy/cloud/helm/platform/components/operator/templates/manager-rbac.yaml +++ b/deploy/cloud/helm/platform/components/operator/templates/manager-rbac.yaml @@ -116,7 +116,6 @@ rules: - patch - update - watch -{{- if .Values.dynamo.enableGrove }} - apiGroups: - grove.io resources: @@ -129,7 +128,6 @@ rules: - patch - update - watch -{{- end }} - apiGroups: - apps resources: diff --git a/deploy/cloud/helm/platform/components/operator/values.yaml b/deploy/cloud/helm/platform/components/operator/values.yaml index 086677fcb0..540d23a768 100644 --- a/deploy/cloud/helm/platform/components/operator/values.yaml +++ b/deploy/cloud/helm/platform/components/operator/values.yaml @@ -82,7 +82,7 @@ dynamo: annotations: {} enableLWS: false - enableGrove: false + groveTerminationDelay: 15m internalImages: debugger: python:3.12-slim diff --git a/deploy/cloud/helm/platform/values.yaml b/deploy/cloud/helm/platform/values.yaml index baec3d0e8b..c9b3b9924a 100644 --- a/deploy/cloud/helm/platform/values.yaml +++ b/deploy/cloud/helm/platform/values.yaml @@ -34,7 +34,7 @@ dynamo-operator: imagePullSecrets: [] dynamo: enableLWS: false - enableGrove: false + groveTerminationDelay: 15m internalImages: debugger: python:3.12-slim enableRestrictedSecurityContext: false diff --git a/deploy/cloud/operator/cmd/main.go b/deploy/cloud/operator/cmd/main.go index 845d59afb6..ac8a142caa 100644 --- a/deploy/cloud/operator/cmd/main.go +++ b/deploy/cloud/operator/cmd/main.go @@ -30,6 +30,7 @@ import ( // to ensure that exec-entrypoint and run can make use of them. 
clientv3 "go.etcd.io/etcd/client/v3" corev1 "k8s.io/api/core/v1" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" _ "k8s.io/client-go/plugin/pkg/client/auth" @@ -50,6 +51,7 @@ import ( grovev1alpha1 "github.com/NVIDIA/grove/operator/api/core/v1alpha1" nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1" + "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts" "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller" commonController "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common" "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/etcd" @@ -73,6 +75,10 @@ func init() { utilruntime.Must(volcanoscheme.AddToScheme(scheme)) utilruntime.Must(grovev1alpha1.AddToScheme(scheme)) + + utilruntime.Must(apiextensionsv1.AddToScheme(scheme)) + + utilruntime.Must(istioclientsetscheme.AddToScheme(scheme)) //+kubebuilder:scaffold:scheme } @@ -92,7 +98,7 @@ func main() { var ingressControllerTLSSecretName string var ingressHostSuffix string var enableLWS bool - var enableGrove bool + var groveTerminationDelay time.Duration flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.") flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") flag.BoolVar(&enableLeaderElection, "leader-elect", false, @@ -120,22 +126,23 @@ func main() { "The suffix to use for the ingress host") flag.BoolVar(&enableLWS, "enable-lws", false, "If set, enable leader worker set") - flag.BoolVar(&enableGrove, "enable-grove", false, - "If set, enable grove") + flag.DurationVar(&groveTerminationDelay, "grove-termination-delay", consts.DefaultGroveTerminationDelay, + "The termination delay for Grove PodGangSets") opts := zap.Options{ Development: true, } opts.BindFlags(flag.CommandLine) flag.Parse() - utilruntime.Must(istioclientsetscheme.AddToScheme(scheme)) - ctrlConfig := commonController.Config{ RestrictedNamespace: restrictedNamespace, EnableLWS: enableLWS, - EnableGrove: enableGrove, - EtcdAddress: etcdAddr, - NatsAddress: natsAddr, + Grove: commonController.GroveConfig{ + Enabled: false, // Will be set after Grove discovery + TerminationDelay: groveTerminationDelay, + }, + EtcdAddress: etcdAddr, + NatsAddress: natsAddr, IngressConfig: commonController.IngressConfig{ VirtualServiceGateway: istioVirtualServiceGateway, IngressControllerClassName: ingressControllerClassName, @@ -201,6 +208,11 @@ func main() { os.Exit(1) } + // Detect Grove availability using discovery client + setupLog.Info("Detecting Grove availability...") + groveEnabled := commonController.DetectGroveAvailability(mainCtx, mgr) + ctrlConfig.Grove.Enabled = groveEnabled + // Create etcd client cli, err := clientv3.New(clientv3.Config{ Endpoints: []string{etcdAddr}, diff --git a/deploy/cloud/operator/internal/consts/consts.go b/deploy/cloud/operator/internal/consts/consts.go index a744d45249..599bdf51f3 100644 --- a/deploy/cloud/operator/internal/consts/consts.go +++ b/deploy/cloud/operator/internal/consts/consts.go @@ -1,5 +1,7 @@ package consts +import "time" + const ( HPACPUDefaultAverageUtilization = 80 @@ -37,4 +39,6 @@ const ( PlannerServiceAccountName = "planner-serviceaccount" DefaultIngressSuffix = "local" + + DefaultGroveTerminationDelay = 15 * time.Minute ) diff --git a/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go 
b/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go index cc669be47e..36601fff97 100644 --- a/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go +++ b/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go @@ -144,7 +144,7 @@ type Resource interface { func (r *DynamoGraphDeploymentReconciler) reconcileResources(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) (State, Reason, Message, error) { logger := log.FromContext(ctx) - if r.Config.EnableGrove { + if r.Config.Grove.Enabled { // check if explicit opt out of grove if dynamoDeployment.Annotations[consts.KubeAnnotationEnableGrove] == consts.KubeLabelValueFalse { logger.Info("Grove is explicitly disabled for this deployment, skipping grove resources reconciliation") @@ -308,7 +308,7 @@ func (r *DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) err GenericFunc: func(ge event.GenericEvent) bool { return true }, })). WithEventFilter(commonController.EphemeralDeploymentEventFilter(r.Config)) - if r.Config.EnableGrove { + if r.Config.Grove.Enabled { ctrlBuilder = ctrlBuilder.Owns(&grovev1alpha1.PodGangSet{}, builder.WithPredicates(predicate.Funcs{ // ignore creation cause we don't want to be called again after we create the pod gang set CreateFunc: func(ce event.CreateEvent) bool { return false }, diff --git a/deploy/cloud/operator/internal/controller_common/predicate.go b/deploy/cloud/operator/internal/controller_common/predicate.go index 539fde2714..5ad7724cfb 100644 --- a/deploy/cloud/operator/internal/controller_common/predicate.go +++ b/deploy/cloud/operator/internal/controller_common/predicate.go @@ -20,18 +20,28 @@ package controller_common import ( "context" "strings" + "time" "k8s.io/apimachinery/pkg/api/meta" + "k8s.io/client-go/discovery" + ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/predicate" ) +type GroveConfig struct { + // Enabled is automatically determined by checking if Grove CRDs are installed in the cluster + Enabled bool + // TerminationDelay configures the termination delay for Grove PodGangSets + TerminationDelay time.Duration +} + type Config struct { // Enable resources filtering, only the resources belonging to the given namespace will be handled. 
RestrictedNamespace string EnableLWS bool - EnableGrove bool + Grove GroveConfig EtcdAddress string NatsAddress string IngressConfig IngressConfig @@ -48,6 +58,43 @@ func (i *IngressConfig) UseVirtualService() bool { return i.VirtualServiceGateway != "" } +// DetectGroveAvailability checks if Grove is available by checking if the Grove API group is registered +// This approach uses the discovery client which is simpler and more reliable +func DetectGroveAvailability(ctx context.Context, mgr ctrl.Manager) bool { + logger := log.FromContext(ctx) + + // Use the discovery client to check if Grove API groups are available + cfg := mgr.GetConfig() + if cfg == nil { + logger.Info("Grove detection failed, no discovery client available") + return false + } + + // Try to create a discovery client + discoveryClient, err := discovery.NewDiscoveryClientForConfig(cfg) + if err != nil { + logger.Error(err, "Grove detection failed, could not create discovery client") + return false + } + + // Check if grove.io API group is available + apiGroups, err := discoveryClient.ServerGroups() + if err != nil { + logger.Error(err, "Grove detection failed, could not list server groups") + return false + } + + for _, group := range apiGroups.Groups { + if group.Name == "grove.io" { + logger.Info("Grove is available, grove.io API group found") + return true + } + } + + logger.Info("Grove not available, grove.io API group not found") + return false +} + func EphemeralDeploymentEventFilter(config Config) predicate.Predicate { return predicate.NewPredicateFuncs(func(o client.Object) bool { l := log.FromContext(context.Background()) diff --git a/deploy/cloud/operator/internal/dynamo/graph.go b/deploy/cloud/operator/internal/dynamo/graph.go index a0ba1a4ae0..807511fbb8 100644 --- a/deploy/cloud/operator/internal/dynamo/graph.go +++ b/deploy/cloud/operator/internal/dynamo/graph.go @@ -166,10 +166,16 @@ func GenerateDynamoComponentsDeployments(ctx context.Context, parentDynamoGraphD labels[commonconsts.KubeLabelDynamoComponent] = componentName labels[commonconsts.KubeLabelDynamoNamespace] = dynamoNamespace if component.ComponentType == commonconsts.ComponentTypePlanner { + // ensure that the extraPodSpec is not nil if deployment.Spec.ExtraPodSpec == nil { deployment.Spec.ExtraPodSpec = &common.ExtraPodSpec{} } - deployment.Spec.ExtraPodSpec.ServiceAccountName = commonconsts.PlannerServiceAccountName + // ensure that the embedded PodSpec struct is not nil + if deployment.Spec.ExtraPodSpec.PodSpec == nil { + deployment.Spec.ExtraPodSpec.PodSpec = &corev1.PodSpec{} + } + // finally set the service account name + deployment.Spec.ExtraPodSpec.PodSpec.ServiceAccountName = commonconsts.PlannerServiceAccountName } if deployment.IsMainComponent() && defaultIngressSpec != nil && deployment.Spec.Ingress == nil { deployment.Spec.Ingress = defaultIngressSpec @@ -316,6 +322,9 @@ func GenerateGrovePodGangSet(ctx context.Context, dynamoDeployment *v1alpha1.Dyn gangSet.Name = dynamoDeployment.Name gangSet.Namespace = dynamoDeployment.Namespace gangSet.Spec.Replicas = 1 + if controllerConfig.Grove.TerminationDelay > 0 { + gangSet.Spec.Template.TerminationDelay = &metav1.Duration{Duration: controllerConfig.Grove.TerminationDelay} + } for componentName, component := range dynamoDeployment.Spec.Services { container := corev1.Container{ Name: "main", diff --git a/deploy/cloud/operator/internal/dynamo/graph_test.go b/deploy/cloud/operator/internal/dynamo/graph_test.go index bfa66c30d3..f6eda2e39e 100644 --- 
a/deploy/cloud/operator/internal/dynamo/graph_test.go +++ b/deploy/cloud/operator/internal/dynamo/graph_test.go @@ -23,6 +23,7 @@ import ( "reflect" "sort" "testing" + "time" grovev1alpha1 "github.com/NVIDIA/grove/operator/api/core/v1alpha1" "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/dynamo/common" @@ -1136,6 +1137,9 @@ func TestGenerateGrovePodGangSet(t *testing.T) { controllerConfig: controller_common.Config{ EtcdAddress: "etcd-address", NatsAddress: "nats-address", + Grove: controller_common.GroveConfig{ + TerminationDelay: 15 * time.Minute, + }, }, dynamoDeployment: &v1alpha1.DynamoGraphDeployment{ ObjectMeta: metav1.ObjectMeta{ @@ -1272,6 +1276,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) { Spec: grovev1alpha1.PodGangSetSpec{ Replicas: 1, Template: grovev1alpha1.PodGangSetTemplateSpec{ + TerminationDelay: &metav1.Duration{Duration: 15 * time.Minute}, Cliques: []*grovev1alpha1.PodCliqueTemplateSpec{ { Name: "frontend", diff --git a/deploy/helm/README.md b/deploy/helm/README.md index 704a11e3db..b8631d1aa7 100644 --- a/deploy/helm/README.md +++ b/deploy/helm/README.md @@ -34,6 +34,7 @@ Here is how you would install a VLLM inference backend example. ```bash helm upgrade --install dynamo-graph ./deploy/helm/chart -n dynamo-cloud -f ./components/backends/vllm/deploy/agg.yaml +``` ### Installation using Grove diff --git a/deploy/inference-gateway/README.md b/deploy/inference-gateway/README.md index 7787d57b64..405380e49c 100644 --- a/deploy/inference-gateway/README.md +++ b/deploy/inference-gateway/README.md @@ -18,8 +18,7 @@ Currently, this setup is only kgateway based Inference Gateway. 1. **Install Dynamo Platform** -[See Quickstart Guide](../../../docs/guides/dynamo_deploy/quickstart.md) to install Dynamo Cloud. - +[See Quickstart Guide](../../docs/guides/dynamo_deploy/quickstart.md) to install Dynamo Cloud. 2. **Deploy Inference Gateway** @@ -70,7 +69,17 @@ kubectl get gateway inference-gateway -n my-model # inference-gateway kgateway x.x.x.x True 1m ``` -3. **Install dynamo model and dynamo gaie helm chart** +3. **Deploy model** + +Follow the steps in [model deployment](../../docs/components/backends/vllm/deploy/README.md) to deploy `Qwen/Qwen3-0.6B` model in aggregate mode using [agg.yaml](../../components/backends/vllm/deploy/agg.yaml) in `my-model` kubernetes namespace. + +Sample commands to deploy model: +```bash +cd /components/backends/vllm/deploy +kubectl apply -f agg.yaml -n my-model +``` + +4. **Install Dynamo GAIE helm chart** The Inference Gateway is configured through the `inference-gateway-resources.yaml` file. 
diff --git a/deploy/metrics/README.md b/deploy/metrics/README.md index ce3b8e6aef..e23a13263f 100644 --- a/deploy/metrics/README.md +++ b/deploy/metrics/README.md @@ -87,7 +87,7 @@ Grafana is pre-configured with: ## Required Files The following configuration files should be present in this directory: -- [docker-compose.yml](./docker-compose.yml): Defines the Prometheus and Grafana services +- [docker-compose.yml](../docker-compose.yml): Defines the Prometheus and Grafana services - [prometheus.yml](./prometheus.yml): Contains Prometheus scraping configuration - [grafana-datasources.yml](./grafana-datasources.yml): Contains Grafana datasource configuration - [grafana_dashboards/grafana-dashboard-providers.yml](./grafana_dashboards/grafana-dashboard-providers.yml): Contains Grafana dashboard provider configuration diff --git a/docs/API/nixl_connect/README.md b/docs/API/nixl_connect/README.md index 741b943847..4cb5620923 100644 --- a/docs/API/nixl_connect/README.md +++ b/docs/API/nixl_connect/README.md @@ -64,88 +64,6 @@ sequenceDiagram RemoteWorker -->> LocalWorker: Notify completion (unblock awaiter) ``` -## Examples - -### Generic Example - -In the diagram below, Local creates a [`WritableOperation`](writable_operation.md) intended to receive data from Remote. -Local then sends metadata about the requested RDMA operation to Remote. -Remote then uses the metadata to create a [`WriteOperation`](write_operation.md) which will perform the GPU Direct RDMA memory transfer from Remote's GPU memory to Local's GPU memory. - -```mermaid ---- -title: Write Operation Between Two Workers ---- -flowchart LR - c1[Remote] --"3: .begin_write()"--- WriteOperation - WriteOperation e1@=="4: GPU Direct RDMA"==> WritableOperation - WritableOperation --"1: .create_writable()"--- c2[Local] - c2 e2@--"2: RDMA Metadata via HTTP"--> c1 - e1@{ animate: true; } - e2@{ animate: true; } -``` - -### Multimodal Example - -In the case of the [Dynamo Multimodal Disaggregated Example](../../examples/multimodal/README.md): - - 1. The HTTP frontend accepts a text prompt and a URL to an image. - - 2. The prompt and URL are then enqueued with the Processor before being dispatched to the first available Decode Worker. - - 3. Decode Worker then requests a Prefill Worker to provide key-value data for the LLM powering the Decode Worker. - - 4. Prefill Worker then requests that the image be processed and provided as embeddings by the Encode Worker. - - 5. Encode Worker acquires the image, processes it, performs inference on the image using a specialized vision model, and finally provides the embeddings to Prefill Worker. - - 6. Prefill Worker receives the embeddings from Encode Worker and generates a key-value cache (KV$) update for Decode Worker's LLM and writes the update directly to the GPU memory reserved for the data. - - 7. Finally, Decode Worker performs the requested inference. 
- -```mermaid ---- -title: Multimodal Disaggregated Workflow ---- -flowchart LR - p0[HTTP Frontend] i0@--"text prompt"-->p1[Processor] - p0 i1@--"url"-->p1 - p1 i2@--"prompt"-->dw[Decode Worker] - p1 i3@--"url"-->dw - dw i4@--"prompt"-->pw[Prefill Worker] - dw i5@--"url"-->pw - pw i6@--"url"-->ew[Encode Worker] - ew o0@=="image embeddings"==>pw - pw o1@=="kv_cache updates"==>dw - dw o2@--"inference results"-->p0 - - i0@{ animate: true; } - i1@{ animate: true; } - i2@{ animate: true; } - i3@{ animate: true; } - i4@{ animate: true; } - i5@{ animate: true; } - i6@{ animate: true; } - o0@{ animate: true; } - o1@{ animate: true; } - o2@{ animate: true; } -``` - -> [!Note] -> In this example, it is the data transfer between the Prefill Worker and the Encode Worker that utilizes the Dynamo NIXL Connect library. -> The KV Cache transfer between Decode Worker and Prefill Worker utilizes the NIXL base RDMA subsystem directly without using the Dynamo NIXL Connect library. - -#### Code Examples - -See [prefill_worker](https://github.com/ai-dynamo/dynamo/tree/main/examples/multimodal/components/prefill_worker.py#L199) or [decode_worker](https://github.com/ai-dynamo/dynamo/tree/main/examples/multimodal/components/decode_worker.py#L239) from our Multimodal example, -for how they coordinate directly with the Encode Worker by creating a [`WritableOperation`](writable_operation.md), -sending the operation's metadata via Dynamo's round-robin dispatcher, and awaiting the operation for completion before making use of the transferred data. - -See [encode_worker](https://github.com/ai-dynamo/dynamo/tree/main/examples/multimodal/components/encode_worker.py#L190) from our Multimodal example, -for how the resulting embeddings are registered with the RDMA subsystem by creating a [`Descriptor`](descriptor.md), -a [`WriteOperation`](write_operation.md) is created using the metadata provided by the requesting worker, -and the worker awaits for the data transfer to complete for yielding a response. - ## Python Classes @@ -154,7 +72,6 @@ and the worker awaits for the data transfer to complete for yielding a response. - [Device](device.md) - [ReadOperation](read_operation.md) - [ReadableOperation](readable_operation.md) - - [SerializedRequest](serialized_request.md) - [WritableOperation](writable_operation.md) - [WriteOperation](write_operation.md) @@ -164,5 +81,4 @@ and the worker awaits for the data transfer to complete for yielding a response. - [NVIDIA Dynamo](https://developer.nvidia.com/dynamo) @ [GitHub](https://github.com/ai-dynamo/dynamo) - [NVIDIA Dynamo NIXL Connect](https://github.com/ai-dynamo/dynamo/tree/main/docs/runtime/nixl_connect) - [NVIDIA Inference Transfer Library (NIXL)](https://developer.nvidia.com/blog/introducing-nvidia-dynamo-a-low-latency-distributed-inference-framework-for-scaling-reasoning-ai-models/#nvidia_inference_transfer_library_nixl_low-latency_hardware-agnostic_communication%C2%A0) @ [GitHub](https://github.com/ai-dynamo/nixl) - - [Dynamo Multimodal Example](https://github.com/ai-dynamo/dynamo/tree/main/examples/multimodal) - [NVIDIA GPU Direct](https://developer.nvidia.com/gpudirect) diff --git a/docs/API/nixl_connect/connector.md b/docs/API/nixl_connect/connector.md index 7b8b1fa611..99bc81fc5b 100644 --- a/docs/API/nixl_connect/connector.md +++ b/docs/API/nixl_connect/connector.md @@ -28,7 +28,7 @@ The connector provides two methods of moving data between workers: - Preparing local memory to be read by a remote worker. 
-In both cases, local memory is registered with the NIXL-based RDMA subsystem via the [`Descriptor`](#descriptor) class and provided to the connector. +In both cases, local memory is registered with the NIXL-based RDMA subsystem via the [`Descriptor`](descriptor.md) class and provided to the connector. The connector then configures the RDMA subsystem to expose the memory for the requested operation and returns an operation control object. The operation control object, either a [`ReadableOperation`](readable_operation.md) or a [`WritableOperation`](writable_operation.md), provides RDMA metadata ([RdmaMetadata](rdma_metadata.md)) via its `.metadata()` method, functionality to query the operation's current state, as well as the ability to cancel the operation prior to its completion. diff --git a/docs/architecture/dynamo_flow.md b/docs/architecture/dynamo_flow.md index 32146e1188..ce7187c52d 100644 --- a/docs/architecture/dynamo_flow.md +++ b/docs/architecture/dynamo_flow.md @@ -17,7 +17,7 @@ limitations under the License. # Dynamo Architecture Flow -This diagram shows the NVIDIA Dynamo disaggregated inference system as implemented in [examples/llm](https://github.com/ai-dynamo/dynamo/tree/main/examples/llm). Color-coded flows indicate different types of operations: +This diagram shows the NVIDIA Dynamo disaggregated inference system as implemented in [examples/llm](https://github.com/ai-dynamo/dynamo/tree/v0.3.2/examples/llm). Color-coded flows indicate different types of operations: ## πŸ”΅ Main Request Flow (Blue) The primary user journey through the system: @@ -67,7 +67,7 @@ Coordination and messaging support: ## Technical Implementation Details -### NIXL (NVIDIA Interchange Library): +### NIXL (NVIDIA Inference Xfer Library): - Enables high-speed GPU-to-GPU data transfers using NVLink/PCIe - Decode Worker publishes GPU metadata to ETCD for coordination - PrefillWorker loads metadata to establish direct communication channels diff --git a/docs/architecture/kv_cache_routing.md b/docs/architecture/kv_cache_routing.md index a78feef9f5..35e5095b59 100644 --- a/docs/architecture/kv_cache_routing.md +++ b/docs/architecture/kv_cache_routing.md @@ -21,7 +21,7 @@ The KV-aware routing arguments: - `--router-temperature`: Sets the temperature when randomly selecting workers to route to via softmax sampling on the router cost logits. Setting it to 0 recovers the deterministic behavior where the min logit is picked. -- `--use-kv-events`: Sets whether to listen to KV events for maintaining the global view of cached blocks. If true, then we use the `KvIndexer` to listen to the block creation and deletion events. If false, `ApproxKvIndexer`, which assumes the kv cache of historical prompts exists for fixed time durations (hard-coded to 120s), is used to predict the kv cache hit ratio in each engine. Set false if your backend engine does not emit KV events. +- `--kv-events`: Sets whether to listen to KV events for maintaining the global view of cached blocks. If true, then we use the `KvIndexer` to listen to the block creation and deletion events. If false, `ApproxKvIndexer`, which assumes the kv cache of historical prompts exists for fixed time durations (hard-coded to 120s), is used to predict the kv cache hit ratio in each engine. Set false if your backend engine does not emit KV events. 
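+
+For illustration only, the routing arguments above might be combined as follows. Attaching them to the frontend launcher used elsewhere in these docs is an assumption, as is the temperature value; adjust to however your deployment starts the KV router:
+
+```bash
+# Hypothetical invocation: the flags are documented on this page, the launcher choice and values are assumed.
+python3 -m dynamo.frontend \
+  --router-mode kv \
+  --router-temperature 0.5 \
+  --kv-events
+```
+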
## Architecture diff --git a/docs/architecture/planner_intro.rst b/docs/architecture/planner_intro.rst index 07d91b1132..dfafe2af69 100644 --- a/docs/architecture/planner_intro.rst +++ b/docs/architecture/planner_intro.rst @@ -19,13 +19,13 @@ Planner The planner monitors the state of the system and adjusts workers to ensure that the system runs efficiently. -Currently, the planner can scale the number of vllm workers up and down based on the kv cache load and prefill queue size: +Currently, the planner can scale the number of vLLM workers up and down based on the kv cache load and prefill queue size: Key features include: * **Load-based scaling** that monitors KV cache utilization and prefill queue size to make scaling decisions * **SLA-based scaling** that uses predictive modeling and performance interpolation to proactively meet TTFT and ITL targets -* **Multi-backend support** for both local (Circus) and Kubernetes environments +* **Multi-backend support** for Kubernetes environments * **Graceful scaling** that ensures no requests are dropped during scale-down operations .. list-table:: @@ -50,9 +50,6 @@ Key features include: * - - ❌ - SGLang - * - - - ❌ - - llama.cpp * - **Serving Type** - βœ… - Aggregated diff --git a/docs/components/backends/llm/README.md b/docs/components/backends/llm/README.md deleted file mode 120000 index 615da9417b..0000000000 --- a/docs/components/backends/llm/README.md +++ /dev/null @@ -1 +0,0 @@ -../../../../components/backends/llm/README.md \ No newline at end of file diff --git a/docs/components/backends/sglang/README.md b/docs/components/backends/sglang/README.md new file mode 100644 index 0000000000..98d7dda0c6 --- /dev/null +++ b/docs/components/backends/sglang/README.md @@ -0,0 +1,184 @@ + + +# LLM Deployment using SGLang + +This directory contains an SGLang component for Dynamo and reference implementations for deploying Large Language Models (LLMs) in various configurations using SGLang. SGLang internally uses ZMQ to communicate between the ingress and the engine processes. For Dynamo, we leverage the runtime to communicate directly with the engine processes and handle ingress and pre/post processing on our end. 
+
+## Use the Latest Release
+
+We recommend using the latest stable release of dynamo to avoid breaking changes:
+
+[![GitHub Release](https://img.shields.io/github/v/release/ai-dynamo/dynamo)](https://github.com/ai-dynamo/dynamo/releases/latest)
+
+You can find the latest release [here](https://github.com/ai-dynamo/dynamo/releases/latest) and check out the corresponding branch with:
+
+```bash
+git checkout $(git describe --tags $(git rev-list --tags --max-count=1))
+```
+
+---
+
+## Table of Contents
+- [Feature Support Matrix](#feature-support-matrix)
+- [Quick Start](#quick-start)
+- [Single Node Examples](#run-single-node-examples)
+- [Multi-Node and Advanced Examples](#advanced-examples)
+- [Deploy on SLURM or Kubernetes](#deployment)
+
+## Feature Support Matrix
+
+### Core Dynamo Features
+
+| Feature | SGLang | Notes |
+|---------|--------|-------|
+| [**Disaggregated Serving**](../../../architecture/disagg_serving.md) | βœ… | |
+| [**Conditional Disaggregation**](../../../architecture/disagg_serving.md#conditional-disaggregation) | 🚧 | WIP [PR](https://github.com/sgl-project/sglang/pull/7730) |
+| [**KV-Aware Routing**](../../../architecture/kv_cache_routing.md) | βœ… | |
+| [**SLA-Based Planner**](../../../architecture/sla_planner.md) | ❌ | Planned |
+| [**Load Based Planner**](../../../architecture/load_planner.md) | ❌ | Planned |
+| [**KVBM**](../../../architecture/kvbm_architecture.md) | ❌ | Planned |
+
+### Large Scale P/D and WideEP Features
+
+| Feature | SGLang | Notes |
+|---------------------|--------|--------------------------------------------------------------|
+| **WideEP** | βœ… | Full support on H100s/GB200 |
+| **DP Rank Routing** | 🚧 | Direct routing supported. The Dynamo KV router does not route to DP workers |
+| **GB200 Support** | βœ… | |
+
+## Quick Start
+
+Below is a guide that lets you run all of the common deployment patterns on a single node.
+
+### Start NATS and ETCD in the background
+
+Start them using Docker Compose:
+
+```bash
+docker compose -f deploy/docker-compose.yml up -d
+```
+
+### Build container
+
+```bash
+# pull our pre-built sglang runtime container
+docker pull nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.3.2
+# or build from source
+./container/build.sh --framework sglang
+```
+
+### Run container
+
+```bash
+./container/run.sh -it --framework sglang
+```
+
+## Run Single Node Examples
+
+> [!IMPORTANT]
+> Each example corresponds to a simple bash script that runs the OpenAI-compatible server, processor, and optional router (written in Rust) and the LLM engine (written in Python) in a single terminal. You can easily take each command and run them in separate terminals.
+>
+> Additionally, because we use sglang's argument parser, you can pass in any argument that sglang supports to the worker!
+
+### Aggregated Serving
+
+```bash
+cd $DYNAMO_ROOT/components/backends/sglang
+./launch/agg.sh
+```
+
+### Aggregated Serving with KV Routing
+
+> [!NOTE]
+> The current implementation of `components/backends/sglang/src/dynamo/sglang/worker/main.py` publishes _placeholder_ engine metrics to keep the Dynamo KV-router happy. Real-time metrics will be surfaced directly from the SGLang engine once the following pull request is merged:
+> β€’ Dynamo: [ai-dynamo/dynamo #1465](https://github.com/ai-dynamo/dynamo/pull/1465) – _feat: receive kvmetrics from sglang scheduler_.
+>
+> After it is merged, the TODOs in `main.py` will be resolved and the placeholder logic removed.
+ +```bash +cd $DYNAMO_ROOT/components/backends/sglang +./launch/agg_router.sh +``` + +### Disaggregated serving + +
+Under the hood: SGLang Load Balancer vs Dynamo Discovery + +SGLang uses a mini load balancer to route requests to handle disaggregated serving. The load balancer functions as follows: + +1. The load balancer receives a request from the client +2. A random `(prefill, decode)` pair is selected from the pool of available workers +3. Request is sent to both `prefill` and `decode` workers via asyncio tasks +4. Internally disaggregation is done from prefill -> decode + +Because Dynamo has a discovery mechanism, we do not use a load balancer. Instead, we first route to a random prefill worker, select a random decode worker, and then send the request to both. Internally, SGLang's bootstrap server (which is a part of the `tokenizer_manager`) is used in conjuction with NIXL to handle the kv transfer. + +
+ +> [!IMPORTANT] +> Disaggregated serving in SGLang currently requires each worker to have the same tensor parallel size [unless you are using an MLA based model](https://github.com/sgl-project/sglang/pull/5922) + +```bash +cd $DYNAMO_ROOT/components/backends/sglang +./launch/disagg.sh +``` + +### Disaggregated Serving with Mixture-of-Experts (MoE) models and DP attention + +You can use this configuration to test out disaggregated serving with dp attention and expert parallelism on a single node before scaling to the full DeepSeek-R1 model across multiple nodes. + +```bash +# note this will require 4 GPUs +cd $DYNAMO_ROOT/components/backends/sglang +./launch/disagg_dp_attn.sh +``` + +## Request Migration + +In a Distributed System, a request may fail due to connectivity issues between the Frontend and the Backend. + +The Frontend will automatically track which Backends are having connectivity issues with it and avoid routing new requests to the Backends with known connectivity issues. + +For ongoing requests, there is a `--migration-limit` flag which can be set on the Backend that tells the Frontend how many times a request can be migrated to another Backend should there be a loss of connectivity to the current Backend. + +For example, +```bash +python3 -m dynamo.sglang ... --migration-limit=3 +``` +indicates a request to this model may be migrated up to 3 times to another Backend, before failing the request, should the Frontend detects a connectivity issue to the current Backend. + +The migrated request will continue responding to the original request, allowing for a seamless transition between Backends, and a reduced overall request failure rate at the Frontend for enhanced user experience. + +## Advanced Examples + +Below we provide a selected list of advanced examples. Please open up an issue if you'd like to see a specific example! + +### Run on multi-node +- **[Run a multi-node model](docs/multinode-examples.md)** + +### Large scale P/D disaggregation with WideEP +- **[Run DeepSeek-R1 on 104+ H100s](docs/dsr1-wideep-h100.md)** + +### Speculative Decoding +- **Deploying DeepSeek-R1 with MTP - coming soon!** + +### Structured Output and Tool Calling +- **Tool calling with Dynamo - coming soon!** + +### Supporting SGLang's native endpoints via Dynamo +- **[HTTP Server for native SGLang endpoints](docs/sgl-http-server.md)** + +## Deployment + +We currently provide deployment examples for Kubernetes and SLURM + +## Kubernetes +- **[Deploying Dynamo with SGLang on Kubernetes](deploy/README.md)** + +## SLURM +- **[Deploying Dynamo with SGLang on SLURM](slurm_jobs/README.md)** diff --git a/docs/components/backends/sglang/deploy/README.md b/docs/components/backends/sglang/deploy/README.md new file mode 100644 index 0000000000..86a0ed5515 --- /dev/null +++ b/docs/components/backends/sglang/deploy/README.md @@ -0,0 +1,162 @@ +# SGLang Kubernetes Deployment Configurations + +This directory contains Kubernetes Custom Resource Definition (CRD) templates for deploying SGLang inference graphs using the **DynamoGraphDeployment** resource. + +## Available Deployment Patterns + +### 1. **Aggregated Deployment** (`agg.yaml`) +Basic deployment pattern with frontend and a single decode worker. + +**Architecture:** +- `Frontend`: OpenAI-compatible API server +- `SGLangDecodeWorker`: Single worker handling both prefill and decode + +### 2. **Aggregated Router Deployment** (`agg_router.yaml`) +Enhanced aggregated deployment with KV cache routing capabilities. 
+ +**Architecture:** +- `Frontend`: OpenAI-compatible API server with router mode enabled (`--router-mode kv`) +- `SGLangDecodeWorker`: Single worker handling both prefill and decode + +### 3. **Disaggregated Deployment** (`disagg.yaml`)** +High-performance deployment with separated prefill and decode workers. + +**Architecture:** +- `Frontend`: HTTP API server coordinating between workers +- `SGLangDecodeWorker`: Specialized decode-only worker (`--disaggregation-mode decode`) +- `SGLangPrefillWorker`: Specialized prefill-only worker (`--disaggregation-mode prefill`) +- Communication via NIXL transfer backend (`--disaggregation-transfer-backend nixl`) + +## CRD Structure + +All templates use the **DynamoGraphDeployment** CRD: + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: +spec: + services: + : + # Service configuration +``` + +### Key Configuration Options + +**Resource Management:** +```yaml +resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" +``` + +**Container Configuration:** +```yaml +extraPodSpec: + mainContainer: + image: my-registry/sglang-runtime:my-tag + workingDir: /workspace/components/backends/sglang + args: + - "python3" + - "-m" + - "dynamo.sglang.worker" + # Model-specific arguments +``` + +## Prerequisites + +Before using these templates, ensure you have: + +1. **Dynamo Cloud Platform installed** - See [Installing Dynamo Cloud](../../../../guides/dynamo_deploy/dynamo_cloud.md) +2. **Kubernetes cluster with GPU support** +3. **Container registry access** for SGLang runtime images +4. **HuggingFace token secret** (referenced as `envFromSecret: hf-token-secret`) + +## Usage + +### 1. Choose Your Template +Select the deployment pattern that matches your requirements: +- Use `agg.yaml` for development/testing +- Use `agg_router.yaml` for production with load balancing +- Use `disagg.yaml` for maximum performance + +### 2. Customize Configuration +Edit the template to match your environment: + +```yaml +# Update image registry and tag +image: your-registry/sglang-runtime:your-tag + +# Configure your model +args: + - "--model-path" + - "your-org/your-model" + - "--served-model-name" + - "your-org/your-model" +``` + +### 3. Deploy + +Use the following command to deploy the deployment file. + +First, create a secret for the HuggingFace token. +```bash +export HF_TOKEN=your_hf_token +kubectl create secret generic hf-token-secret \ + --from-literal=HF_TOKEN=${HF_TOKEN} \ + -n ${NAMESPACE} +``` + +Then, deploy the model using the deployment file. + +```bash +export DEPLOYMENT_FILE=agg.yaml +kubectl apply -f $DEPLOYMENT_FILE -n ${NAMESPACE} +``` + +### 4. Using Custom Dynamo Frameworks Image for SGLang + +To use a custom dynamo frameworks image for SGLang, you can update the deployment file using yq: + +```bash +export DEPLOYMENT_FILE=agg.yaml +export FRAMEWORK_RUNTIME_IMAGE= + +yq '.spec.services.[].extraPodSpec.mainContainer.image = env(FRAMEWORK_RUNTIME_IMAGE)' $DEPLOYMENT_FILE > $DEPLOYMENT_FILE.generated +kubectl apply -f $DEPLOYMENT_FILE.generated -n $NAMESPACE +``` + +## Model Configuration + +All templates use **DeepSeek-R1-Distill-Llama-8B** as the default model. But you can use any sglang argument and configuration. 
Key parameters: + +## Monitoring and Health + +- **Frontend health endpoint**: `http://:8000/health` +- **Liveness probes**: Check process health every 60s + +## Further Reading + +- **Deployment Guide**: [Creating Kubernetes Deployments](../../../../guides/dynamo_deploy/create_deployment.md) +- **Quickstart**: [Deployment Quickstart](../../../../guides/dynamo_deploy/quickstart.md) +- **Platform Setup**: [Dynamo Cloud Installation](../../../../guides/dynamo_deploy/dynamo_cloud.md) +- **Examples**: [Deployment Examples](../../../../examples/README.md) +- **Kubernetes CRDs**: [Custom Resources Documentation](https://kubernetes.io/docs/concepts/extend-kubernetes/api-extension/custom-resources/) + +## Troubleshooting + +Common issues and solutions: + +1. **Pod fails to start**: Check image registry access and HuggingFace token secret +2. **GPU not allocated**: Verify cluster has GPU nodes and proper resource limits +3. **Health check failures**: Review model loading logs and increase `initialDelaySeconds` +4. **Out of memory**: Increase memory limits or reduce model batch size + +For additional support, refer to the [deployment troubleshooting guide](../../../../guides/dynamo_deploy/quickstart.md). diff --git a/docs/components/backends/sglang/docs/dsr1-wideep-h100.md b/docs/components/backends/sglang/docs/dsr1-wideep-h100.md new file mode 100644 index 0000000000..6cfcace10d --- /dev/null +++ b/docs/components/backends/sglang/docs/dsr1-wideep-h100.md @@ -0,0 +1,174 @@ + + +# Running DeepSeek-R1 Disaggregated with WideEP on H100s + +Dynamo supports SGLang's implementation of wide expert parallelism and large scale P/D for DeepSeek-R1! You can read their blog post [here](https://lmsys.org/blog/2025-05-05-large-scale-ep/) for more details. We provide a Dockerfile for this in `container/Dockerfile.sglang-deepep` and configurations to deploy this at scale. In this example, we will run 1 prefill worker on 4 H100 nodes and 1 decode worker on 9 H100 nodes (104 total GPUs). + +## Instructions + +1. Build the Dynamo container + +```bash +cd $DYNAMO_ROOT +docker build -f container/Dockerfile.sglang-wideep . -t dynamo-wideep --no-cache +``` + +2. You can run this container on each 8xH100 node using the following command. + +> [!IMPORTANT] +> We recommend downloading DeepSeek-R1 and then mounting it to the container. You can find the model [here](https://huggingface.co/deepseek-ai/DeepSeek-R1) + +```bash +docker run \ + --gpus all \ + -it \ + --rm \ + --network host \ + --volume /PATH_TO_DSR1_MODEL/:/model/ \ + --shm-size=10G \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + --ulimit nofile=65536:65536 \ + --cap-add CAP_SYS_PTRACE \ + --ipc host \ + dynamo-wideep:latest +``` + +In each container, you should be in the `/sgl-workspace/dynamo/components/backends/sglang` directory. + +3. On the head prefill node, run the helper script provided to generate commands to start the `nats-server`, `etcd`. This script will also tell you which environment variables to export on each node to make deployment easier. + +```bash +./components/backends/sglang/src/dynamo/sglang/utils/gen_env_vars.sh +``` + +4. 
Run the ingress and prefill worker + +```bash +# run ingress +python3 -m dynamo.frontend --http-port=8000 & +# optionally run the http server that allows you to flush the kv cache for all workers (see benchmarking section below) +python3 -m dynamo.sglang.utils.sgl_http_server --ns dynamo & +# run prefill worker +python3 -m dynamo.sglang.worker \ + --model-path /model/ \ + --served-model-name deepseek-ai/DeepSeek-R1 \ + --skip-tokenizer-init \ + --disaggregation-mode prefill \ + --disaggregation-transfer-backend nixl \ + --disaggregation-bootstrap-port 30001 \ + --dist-init-addr ${HEAD_PREFILL_NODE_IP}:29500 \ + --nnodes 4 \ + --node-rank 0 \ + --tp-size 32 \ + --dp-size 32 \ + --enable-dp-attention \ + --decode-log-interval 1 \ + --enable-deepep-moe \ + --page-size 1 \ + --trust-remote-code \ + --moe-dense-tp-size 1 \ + --enable-dp-lm-head \ + --disable-radix-cache \ + --watchdog-timeout 1000000 \ + --enable-two-batch-overlap \ + --deepep-mode normal \ + --mem-fraction-static 0.85 \ + --deepep-config /configs/deepep.json \ + --ep-num-redundant-experts 32 \ + --ep-dispatch-algorithm dynamic \ + --eplb-algorithm deepseek +``` + +On the other prefill node (since this example has 4 total prefill nodes), run the same command but change `--node-rank` to 1,2, and 3 + +5. Run the decode worker on the head decode node + +```bash +python3 -m dynamo.sglang.decode_worker \ + --model-path /model/ \ + --served-model-name deepseek-ai/DeepSeek-R1 \ + --skip-tokenizer-init \ + --disaggregation-mode decode \ + --disaggregation-transfer-backend nixl \ + --disaggregation-bootstrap-port 30001 \ + --dist-init-addr ${HEAD_DECODE_NODE_IP}:29500 \ + --nnodes 9 \ + --node-rank 0 \ + --tp-size 72 \ + --dp-size 72 \ + --enable-dp-attention \ + --decode-log-interval 1 \ + --enable-deepep-moe \ + --page-size 1 \ + --trust-remote-code \ + --moe-dense-tp-size 1 \ + --enable-dp-lm-head \ + --disable-radix-cache \ + --watchdog-timeout 1000000 \ + --enable-two-batch-overlap \ + --deepep-mode low_latency \ + --mem-fraction-static 0.835 \ + --ep-num-redundant-experts 32 \ + --cuda-graph-bs 128 +``` + +On the other decode nodes (this example has 9 total decode nodes), run the same command but change `--node-rank` to 1, 2, 3, 4, 5, 6, 7, and 8 + +## Benchmarking + +In the official [blog post repro instructions](https://github.com/sgl-project/sglang/issues/6017), SGL uses batch inference to benchmark their prefill and decode workers. They do this by pretokenizing the ShareGPT dataset and then creating a batch of 8192 requests with ISL 4096 and OSL 5 (for prefill stress test) and a batch of 40000 with ISL 2000 and OSL 100 (for decode stress test). If you want to repro these benchmarks, you will need to add the following flags to the prefill and decode commands: + +prefill: + +```bash +... +--max-running-requests 8192 \ +--max-total-tokens 131072 \ +--context-length 8192 \ +--init-expert-location /configs/prefill_in4096.json \ +--chunked-prefill-size 524288 + +``` + +decode: + +```bash +... +--max-running-requests 18432 \ +--context-length 4500 \ +--init-expert-location /configs/decode_in2000out100.json +``` + +We currently provide 2 different ways to perform an end to end benchmark which includes using our OpenAI frontend and tokenization. We will continue to add better support for these sorts of large single batch workloads in the future. + +1. 
**GenAI Perf to benchmark end to end performance with 8k ISL 256 OSL** + We've found that 8k ISL 256 OSL provides a good baseline for measuring end to end disaggregated serving performance for DSR1. As WideEP allows for a higher throughput, we provide a script that runs this workload at high concurrencies. DeepGEMM kernels can sometimes take a while to warm up. We provide a short ramping warmup script that can be used. + +Example usage: + +```bash +# warmup +./utils/bench.sh HEAD_PREFILL_NODE_IP --type warmup +# if you ran the http server on the head prefill node, you can optionally flush the kv cache for all workers (similar to SGLangs benchmarking script) +curl -X POST http://${HEAD_PREFILL_NODE_IP}:9001/flush_cache +# run benchmark +./utils/bench.sh HEAD_PREFILL_NODE_IP --type e2e +``` + +2. **GenAI Perf to benchmark completions with custom dataset** + We provide a script that generates a JSONL file of the ShareGPT dataset and then use GenAI Perf to benchmark the prefill and decode workers. We use ShareGPT in order to leverage the pre-existing EPLB distributions provided by the SGLang team. If you don't want to use ShareGPT - you can also use GenAIPerf's synthetic dataset setup But note you will have to use dynamic EPLB configurations or record your own as the `init-expert-location` provided by SGLang is tuned specifically for the ShareGPT dataset at a 4096 ISL and 5 OSL. + +Example usage: + +```bash +# generate data +python3 src/dynamo/sglang/utils/generate_bench_data.py --output data.jsonl --num-prompts 8192 --input-len 4096 --output-len 5 --model deepseek-ai/DeepSeek-R1 +# if you ran the http server on the head prefill node, you can optionally flush the kv cache for all workers (similar to SGLangs benchmarking script) +curl -X POST http://${HEAD_PREFILL_NODE_IP}:9001/flush_cache +# run benchmark +./utils/bench.sh HEAD_PREFILL_NODE_IP --type custom_completions +``` diff --git a/docs/components/backends/sglang/docs/multinode-examples.md b/docs/components/backends/sglang/docs/multinode-examples.md deleted file mode 120000 index 9929f08b4a..0000000000 --- a/docs/components/backends/sglang/docs/multinode-examples.md +++ /dev/null @@ -1 +0,0 @@ -../../../../../components/backends/sglang/docs/multinode-examples.md \ No newline at end of file diff --git a/docs/components/backends/sglang/docs/multinode-examples.md b/docs/components/backends/sglang/docs/multinode-examples.md new file mode 100644 index 0000000000..d6ae5e32e0 --- /dev/null +++ b/docs/components/backends/sglang/docs/multinode-examples.md @@ -0,0 +1,116 @@ + + +# Multinode Examples + +## Multi-node sized models + +SGLang allows you to deploy multi-node sized models by adding in the `dist-init-addr`, `nnodes`, and `node-rank` arguments. Below we demonstrate and example of deploying DeepSeek R1 for disaggregated serving across 4 nodes. This example requires 4 nodes of 8xH100 GPUs. + +**Step 1**: Use the provided helper script to generate commands to start NATS/ETCD on your head prefill node. This script will also give you environment variables to export on each other node. You will need the IP addresses of your head prefill and head decode node to run this script. +```bash +./components/backends/sglang/src/dynamo/sglang/utils/gen_env_vars.sh +``` + +**Step 2**: Ensure that your configuration file has the required arguments. 
Here's an example configuration that runs prefill and the model in TP16: + +Node 1: Run HTTP ingress, processor, and 8 shards of the prefill worker +```bash +# run ingress +python3 -m dynamo.frontend --http-port=8000 & +# run prefill worker +python3 -m dynamo.sglang.worker \ + --model-path /model/ \ + --served-model-name deepseek-ai/DeepSeek-R1 \ + --tp 16 \ + --dp-size 16 \ + --dist-init-addr ${HEAD_PREFILL_NODE_IP}:29500 \ + --nnodes 2 \ + --node-rank 0 \ + --enable-dp-attention \ + --trust-remote-code \ + --skip-tokenizer-init \ + --disaggregation-mode prefill \ + --disaggregation-transfer-backend nixl \ + --disaggregation-bootstrap-port 30001 \ + --mem-fraction-static 0.82 +``` + +Node 2: Run the remaining 8 shards of the prefill worker +```bash +python3 -m dynamo.sglang.worker \ + --model-path /model/ \ + --served-model-name deepseek-ai/DeepSeek-R1 \ + --tp 16 \ + --dp-size 16 \ + --dist-init-addr ${HEAD_PREFILL_NODE_IP}:29500 \ + --nnodes 2 \ + --node-rank 1 \ + --enable-dp-attention \ + --trust-remote-code \ + --skip-tokenizer-init \ + --disaggregation-mode prefill \ + --disaggregation-transfer-backend nixl \ + --disaggregation-bootstrap-port 30001 \ + --mem-fraction-static 0.82 +``` + +Node 3: Run the first 8 shards of the decode worker +```bash +python3 -m dynamo.sglang.decode_worker \ + --model-path /model/ \ + --served-model-name deepseek-ai/DeepSeek-R1 \ + --tp 16 \ + --dp-size 16 \ + --dist-init-addr ${HEAD_DECODE_NODE_IP}:29500 \ + --nnodes 2 \ + --node-rank 0 \ + --enable-dp-attention \ + --trust-remote-code \ + --skip-tokenizer-init \ + --disaggregation-mode decode \ + --disaggregation-transfer-backend nixl \ + --disaggregation-bootstrap-port 30001 \ + --mem-fraction-static 0.82 +``` + +Node 4: Run the remaining 8 shards of the decode worker +```bash +python3 -m dynamo.sglang.decode_worker \ + --model-path /model/ \ + --served-model-name deepseek-ai/DeepSeek-R1 \ + --tp 16 \ + --dp-size 16 \ + --dist-init-addr ${HEAD_DECODE_NODE_IP}:29500 \ + --nnodes 2 \ + --node-rank 1 \ + --enable-dp-attention \ + --trust-remote-code \ + --skip-tokenizer-init \ + --disaggregation-mode decode \ + --disaggregation-transfer-backend nixl \ + --disaggregation-bootstrap-port 30001 \ + --mem-fraction-static 0.82 +``` + +**Step 3**: Run inference +SGLang typically requires a warmup period to ensure the DeepGEMM kernels are loaded. We recommend running a few warmup requests and ensuring that the DeepGEMM kernels load in. + +```bash +curl ${HEAD_PREFILL_NODE_IP}:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "deepseek-ai/DeepSeek-R1", + "messages": [ + { + "role": "user", + "content": "In the heart of the tennis world, where champions rise and fall with each Grand Slam, lies the legend of the Golden Racket of Wimbledon. Once wielded by the greatest players of antiquity, this mythical racket is said to bestow unparalleled precision, grace, and longevity upon its rightful owner. For centuries, it remained hidden, its location lost to all but the most dedicated scholars of the sport. You are Roger Federer, the Swiss maestro whose elegant play and sportsmanship have already cemented your place among the legends, but whose quest for perfection remains unquenched even as time marches on. Recent dreams have brought you visions of this ancient artifact, along with fragments of a map that seems to lead to its resting place. 
Your journey will take you through the hallowed grounds of tennis history, from the clay courts of Roland Garros to the hidden training grounds of forgotten champions, and finally to a secret chamber beneath Centre Court itself. Character Background: Develop a detailed background for Roger Federer in this quest. Describe his motivations for seeking the Golden Racket, his tennis skills and personal weaknesses, and any connections to the legends of the sport that came before him. Is he driven by a desire to extend his career, to secure his legacy as the greatest of all time, or perhaps by something more personal? What price might he be willing to pay to claim this artifact, and what challenges from rivals past and present might stand in his way?" + } + ], + "stream":false, + "max_tokens": 30 + }' +``` + diff --git a/docs/components/backends/sglang/docs/sgl-http-server.md b/docs/components/backends/sglang/docs/sgl-http-server.md new file mode 100644 index 0000000000..28e2b2400a --- /dev/null +++ b/docs/components/backends/sglang/docs/sgl-http-server.md @@ -0,0 +1,80 @@ + + +# Supporting SGLang's native endpoints via HTTP Server + +# Introduction + +The SGLang HTTP server provides a REST API interface for managing and monitoring SGLang components running in a dynamo distributed environment. It leverages dynamo's service discovery mechanism to automatically find and communicate with SGLang workers across the cluster. + +## Architecture Overview + +The HTTP server (`sgl_http_server.py`) is built on FastAPI and integrates with dynamo's `DistributedRuntime` to discover and interact with SGLang components. It uses the following discovery flow: + +1. **Service Discovery**: Queries dynamo's etcd instance to find components that expose specific endpoints +2. **Dynamic Targeting**: Automatically discovers all matching components across namespaces without requiring manual configuration +3. **Direct Communication**: Establishes direct connections to discovered component instances using dynamo's client infrastructure + +## Discovery Mechanism + +The server uses dynamo's hierarchical service discovery structure: + +- **DistributedRuntime**: Maintains connections to etcd (service discovery) and NATS (messaging) +- **Namespace**: Logical grouping of components (default: "dynamo") +- **Component**: Individual SGLang workers or services +- **Endpoint**: Specific functionality exposed by each component + +The discovery process queries etcd with the prefix `instances/` to find all registered components that expose the target endpoint. Components are identified by their namespace, component name, and endpoint, allowing the server to dynamically scale operations across multiple instances. + +## Supported Endpoints + +### Current Endpoints + +#### POST /flush_cache +Flushes the radix cache across all discovered SGLang components. + +**Behavior:** +- Discovers all components in the specified namespace that expose the `flush_cache` endpoint +- Sends flush requests to all instances of each discovered component +- Returns success/failure status with details about the operation + +**Response:** +```json +{ + "message": "Cache flush initiated", + "success": true +} +``` + +### Upcoming Endpoints + +The following endpoints will be supported in future releases: + +#### POST /start_expert_distribution_record +Begins recording expert distribution metrics across SGLang components. + +#### POST /stop_expert_distribution_record +Stops the expert distribution recording process. 
+ +#### GET /dump_expert_distribution_record +Retrieves the collected expert distribution data. + +## Configuration + +The server accepts the following command-line arguments: + +- `--port`: HTTP server port (default: 9001) +- `--ns/--namespace`: Target dynamo namespace (default: "dynamo") +- `--comp/--component`: Specific component name to target (default: discover all) +- `--endpoint`: Endpoint name to discover (default: "flush_cache") + +## Usage + +Start the server: +```bash +python3 -m dynamo.sglang.utils.sgl_http_server --ns dynamo +``` + +The server will automatically discover all SGLang components in the specified namespace and provide HTTP endpoints for managing them. diff --git a/docs/components/backends/sglang/slurm_jobs/README.md b/docs/components/backends/sglang/slurm_jobs/README.md new file mode 100644 index 0000000000..7fa454f39c --- /dev/null +++ b/docs/components/backends/sglang/slurm_jobs/README.md @@ -0,0 +1,108 @@ +# Example: Deploy Multi-node SGLang with Dynamo on SLURM + +This folder implements the example of [SGLang DeepSeek-R1 Disaggregated with WideEP](../docs/dsr1-wideep-h100.md) on a SLURM cluster. + +## Overview + +The scripts in this folder set up multiple cluster nodes to run the [SGLang DeepSeek-R1 Disaggregated with WideEP](../docs/dsr1-wideep-h100.md) example, with separate nodes handling prefill and decode. +The node setup is done using Python job submission scripts with Jinja2 templates for flexible configuration. The setup also includes GPU utilization monitoring capabilities to track performance during benchmarks. + +## Scripts + +- **`submit_job_script.py`**: Main script for generating and submitting SLURM job scripts from templates +- **`job_script_template.j2`**: Jinja2 template for generating SLURM job scripts +- **`scripts/worker_setup.py`**: Worker script that handles the setup on each node +- **`scripts/monitor_gpu_utilization.sh`**: Script for monitoring GPU utilization during benchmarks + +## Logs Folder Structure + +Each SLURM job creates a unique log directory under `logs/` using the job ID. For example, job ID `3062824` creates the directory `logs/3062824/`. + +### Log File Structure + +``` +logs/ +β”œβ”€β”€ 3062824/ # Job ID directory +β”‚ β”œβ”€β”€ log.out # Main job output (node allocation, IP addresses, launch commands) +β”‚ β”œβ”€β”€ log.err # Main job errors +β”‚ β”œβ”€β”€ node0197_prefill.out # Prefill node stdout (node0197) +β”‚ β”œβ”€β”€ node0197_prefill.err # Prefill node stderr (node0197) +β”‚ β”œβ”€β”€ node0200_prefill.out # Prefill node stdout (node0200) +β”‚ β”œβ”€β”€ node0200_prefill.err # Prefill node stderr (node0200) +β”‚ β”œβ”€β”€ node0201_decode.out # Decode node stdout (node0201) +β”‚ β”œβ”€β”€ node0201_decode.err # Decode node stderr (node0201) +β”‚ β”œβ”€β”€ node0204_decode.out # Decode node stdout (node0204) +β”‚ β”œβ”€β”€ node0204_decode.err # Decode node stderr (node0204) +β”‚ β”œβ”€β”€ node0197_prefill_gpu_utilization.log # GPU utilization monitoring (node0197) +β”‚ β”œβ”€β”€ node0200_prefill_gpu_utilization.log # GPU utilization monitoring (node0200) +β”‚ β”œβ”€β”€ node0201_decode_gpu_utilization.log # GPU utilization monitoring (node0201) +β”‚ └── node0204_decode_gpu_utilization.log # GPU utilization monitoring (node0204) +β”œβ”€β”€ 3063137/ # Another job ID directory +β”œβ”€β”€ 3062689/ # Another job ID directory +└── ... +``` + +## Setup + +For simplicity of the example, we will make some assumptions about your SLURM cluster: +1. We assume you have access to a SLURM cluster with multiple GPU nodes + available. 
For functional testing, most setups should be fine. For performance + testing, you should aim to allocate groups of nodes that are performantly + inter-connected, such as those in an NVL72 setup. +2. We assume this SLURM cluster has the [Pyxis](https://github.com/NVIDIA/pyxis) + SPANK plugin setup. In particular, the `job_script_template.j2` template in this + example will use `srun` arguments like `--container-image`, + `--container-mounts`, and `--container-env` that are added to `srun` by Pyxis. + If your cluster supports similar container based plugins, you may be able to + modify the template to use that instead. +3. We assume you have already built a recent Dynamo+SGLang container image as + described [here](../docs/dsr1-wideep-h100.md#instructions). + This is the image that can be passed to the `--container-image` argument in later steps. + +## Usage + +1. **Submit a benchmark job**: + ```bash + python submit_job_script.py \ + --template job_script_template.j2 \ + --model-dir /path/to/model \ + --config-dir /path/to/configs \ + --container-image container-image-uri \ + --account your-slurm-account + ``` + + **Required arguments**: + - `--template`: Path to Jinja2 template file + - `--model-dir`: Model directory path + - `--config-dir`: Config directory path + - `--container-image`: Container image URI (e.g., `registry/repository:tag`) + - `--account`: SLURM account + + **Optional arguments**: + - `--prefill-nodes`: Number of prefill nodes (default: `2`) + - `--decode-nodes`: Number of decode nodes (default: `2`) + - `--gpus-per-node`: Number of GPUs per node (default: `8`) + - `--network-interface`: Network interface to use (default: `eth3`) + - `--job-name`: SLURM job name (default: `dynamo_setup`) + - `--time-limit`: Time limit in HH:MM:SS format (default: `01:00:00`) + + **Note**: The script automatically calculates the total number of nodes needed based on `--prefill-nodes` and `--decode-nodes` parameters. + +2. **Monitor job progress**: + ```bash + squeue -u $USER + ``` + +3. **Check logs in real-time**: + ```bash + tail -f logs/{JOB_ID}/log.out + ``` + +4. **Monitor GPU utilization**: + ```bash + tail -f logs/{JOB_ID}/{node}_prefill_gpu_utilization.log + ``` + +## Outputs + +Benchmark results and outputs are stored in the `outputs/` directory, which is mounted into the container. diff --git a/docs/components/backends/trtllm/README.md b/docs/components/backends/trtllm/README.md deleted file mode 120000 index 15969304d0..0000000000 --- a/docs/components/backends/trtllm/README.md +++ /dev/null @@ -1 +0,0 @@ -../../../../components/backends/trtllm/README.md \ No newline at end of file diff --git a/docs/components/backends/trtllm/README.md b/docs/components/backends/trtllm/README.md new file mode 100644 index 0000000000..c629c20669 --- /dev/null +++ b/docs/components/backends/trtllm/README.md @@ -0,0 +1,239 @@ + + +# LLM Deployment using TensorRT-LLM + +This directory contains examples and reference implementations for deploying Large Language Models (LLMs) in various configurations using TensorRT-LLM. 
+ +## Use the Latest Release + +We recommend using the latest stable release of dynamo to avoid breaking changes: + +[![GitHub Release](https://img.shields.io/github/v/release/ai-dynamo/dynamo)](https://github.com/ai-dynamo/dynamo/releases/latest) + +You can find the latest release [here](https://github.com/ai-dynamo/dynamo/releases/latest) and check out the corresponding branch with: + +```bash +git checkout $(git describe --tags $(git rev-list --tags --max-count=1)) +``` + +--- + +## Table of Contents +- [Feature Support Matrix](#feature-support-matrix) +- [Quick Start](#quick-start) +- [Single Node Examples](#single-node-examples) +- [Advanced Examples](#advanced-examples) +- [Disaggregation Strategy](#disaggregation-strategy) +- [KV Cache Transfer](#kv-cache-transfer-in-disaggregated-serving) +- [Client](#client) +- [Benchmarking](#benchmarking) + +## Feature Support Matrix + +### Core Dynamo Features + +| Feature | TensorRT-LLM | Notes | +|---------|--------------|-------| +| [**Disaggregated Serving**](../../../architecture/disagg_serving.md) | βœ… | | +| [**Conditional Disaggregation**](../../../architecture/disagg_serving.md#conditional-disaggregation) | 🚧 | Not supported yet | +| [**KV-Aware Routing**](../../../architecture/kv_cache_routing.md) | βœ… | | +| [**SLA-Based Planner**](../../../architecture/sla_planner.md) | 🚧 | Planned | +| [**Load Based Planner**](../../../architecture/load_planner.md) | 🚧 | Planned | +| [**KVBM**](../../../architecture/kvbm_architecture.md) | 🚧 | Planned | + +### Large Scale P/D and WideEP Features + +| Feature | TensorRT-LLM | Notes | +|--------------------|--------------|-----------------------------------------------------------------------| +| **WideEP** | βœ… | | +| **Attention DP** | βœ… | | +| **GB200 Support** | βœ… | | + +## Quick Start + +Below we provide a guide that lets you run all of our the common deployment patterns on a single node. + +### Start NATS and ETCD in the background + +Start using Docker Compose + +```bash +docker compose -f deploy/docker-compose.yml up -d +``` + +### Build container + +```bash +# TensorRT-LLM uses git-lfs, which needs to be installed in advance. +apt-get update && apt-get -y install git git-lfs + +# On an x86 machine: +./container/build.sh --framework tensorrtllm + +# On an ARM machine: +./container/build.sh --framework tensorrtllm --platform linux/arm64 + +# Build the container with the default experimental TensorRT-LLM commit +# WARNING: This is for experimental feature testing only. +# The container should not be used in a production environment. +./container/build.sh --framework tensorrtllm --use-default-experimental-tensorrtllm-commit +``` + +### Run container + +```bash +./container/run.sh --framework tensorrtllm -it +``` + +## Single Node Examples + +> [!IMPORTANT] +> Below we provide some simple shell scripts that run the components for each configuration. Each shell script is simply running the `python3 -m dynamo.frontend ` to start up the ingress and using `python3 -m dynamo.trtllm ` to start up the workers. You can easily take each command and run them in separate terminals. 
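+
+As a minimal sketch of that two-terminal pattern (the worker arguments are placeholders; copy the real ones from the launch script you are using):
+
+```bash
+# Terminal 1: OpenAI-compatible ingress (HTTP frontend)
+python3 -m dynamo.frontend --http-port=8000
+
+# Terminal 2: TensorRT-LLM worker; replace <args> with the arguments from your chosen launch script
+python3 -m dynamo.trtllm <args>
+```
+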
+ +This figure shows an overview of the major components to deploy: + +``` ++------+ +-----------+ +------------------+ +---------------+ +| HTTP |----->| processor |----->| Worker1 |------------>| Worker2 | +| |<-----| |<-----| |<------------| | ++------+ +-----------+ +------------------+ +---------------+ + | ^ | + query best | | return | publish kv events + worker | | worker_id v + | | +------------------+ + | +---------| kv-router | + +------------->| | + +------------------+ +``` + +**Note:** The diagram above shows all possible components in a deployment. Depending on the chosen disaggregation strategy, you can configure whether Worker1 handles prefill and Worker2 handles decode, or vice versa. For more information on how to select and configure these strategies, see the [Disaggregation Strategy](#disaggregation-strategy) section below. + +### Aggregated +```bash +cd $DYNAMO_HOME/components/backends/trtllm +./launch/agg.sh +``` + +### Aggregated with KV Routing +```bash +cd $DYNAMO_HOME/components/backends/trtllm +./launch/agg_router.sh +``` + +### Disaggregated + +> [!IMPORTANT] +> Disaggregated serving supports two strategies for request flow: `"prefill_first"` and `"decode_first"`. By default, the script below uses the `"decode_first"` strategy, which can reduce response latency by minimizing extra hops in the return path. You can switch strategies by setting the `DISAGGREGATION_STRATEGY` environment variable. + +```bash +cd $DYNAMO_HOME/components/backends/trtllm +./launch/disagg.sh +``` + +### Disaggregated with KV Routing + +> [!IMPORTANT] +> Disaggregated serving with KV routing uses a "prefill first" workflow by default. Currently, Dynamo supports KV routing to only one endpoint per model. In disaggregated workflow, it is generally more effective to route requests to the prefill worker. If you wish to use a "decode first" workflow instead, you can simply set the `DISAGGREGATION_STRATEGY` environment variable accordingly. + +```bash +cd $DYNAMO_HOME/components/backends/trtllm +./launch/disagg_router.sh +``` + +### Aggregated with Multi-Token Prediction (MTP) and DeepSeek R1 +```bash +cd $DYNAMO_HOME/components/backends/trtllm + +export AGG_ENGINE_ARGS=./engine_configs/deepseek_r1/mtp/mtp_agg.yaml +export SERVED_MODEL_NAME="nvidia/DeepSeek-R1-FP4" +# nvidia/DeepSeek-R1-FP4 is a large model +export MODEL_PATH="nvidia/DeepSeek-R1-FP4" +./launch/agg.sh +``` + +Notes: +- MTP is only available within the container built with the experimental TensorRT-LLM commit. Please add --use-default-experimental-tensorrtllm-commit to the arguments of the build.sh script. + + Example: `./container/build.sh --framework tensorrtllm --use-default-experimental-tensorrtllm-commit` + +- There is a noticeable latency for the first two inference requests. Please send warm-up requests before starting the benchmark. +- MTP performance may vary depending on the acceptance rate of predicted tokens, which is dependent on the dataset or queries used while benchmarking. Additionally, `ignore_eos` should generally be omitted or set to `false` when using MTP to avoid speculating garbage outputs and getting unrealistic acceptance rates. + +## Advanced Examples + +Below we provide a selected list of advanced examples. Please open up an issue if you'd like to see a specific example! + +### Multinode Deployment + +For comprehensive instructions on multinode serving, see the [multinode-examples.md](multinode-examples.md) guide. 
It provides step-by-step deployment examples and configuration tips for running Dynamo with TensorRT-LLM across multiple nodes. While the walkthrough uses DeepSeek-R1 as the model, you can easily adapt the process for any supported model by updating the relevant configuration files. You can see [Llama4+eagle](llama4_plus_eagle.md) guide to learn how to use these scripts when a single worker fits on the single node. + +### Speculative Decoding +- **[Llama 4 Maverick Instruct + Eagle Speculative Decoding](llama4_plus_eagle.md)** + +### Kubernetes Deployment + +For complete Kubernetes deployment instructions, configurations, and troubleshooting, see [TensorRT-LLM Kubernetes Deployment Guide](deploy/README.md) + +### Client + +To send a request to a multi-node deployment, target the node which is running `python3 -m dynamo.frontend `. + +### Benchmarking + +To benchmark your deployment with GenAI-Perf, see this utility script, configuring the +`model` name and `host` based on your deployment: +```bash +{REPO_ROOT}/benchmarks/llm/perf.sh +``` + +## Disaggregation Strategy + +The disaggregation strategy controls how requests are distributed between the prefill and decode workers in a disaggregated deployment. + +By default, Dynamo uses a `decode first` strategy: incoming requests are initially routed to the decode worker, which then forwards them to the prefill worker in round-robin fashion. The prefill worker processes the request and returns results to the decode worker for any remaining decode operations. + +When using KV routing, however, Dynamo switches to a `prefill first` strategy. In this mode, requests are routed directly to the prefill worker, which can help maximize KV cache reuse and improve overall efficiency for certain workloads. Choosing the appropriate strategy can have a significant impact on performance, depending on your use case. + +The disaggregation strategy can be set using the `DISAGGREGATION_STRATEGY` environment variable. You can set the strategy before launching your deployment, for example: +```bash +DISAGGREGATION_STRATEGY="prefill_first" ./launch/disagg.sh +``` + +## KV Cache Transfer in Disaggregated Serving + +Dynamo with TensorRT-LLM supports two methods for transferring KV cache in disaggregated serving: UCX (default) and NIXL (experimental). For detailed information and configuration instructions for each method, see the [KV cache transfer guide](kv-cache-tranfer.md). + +## Request Migration + +In a Distributed System, a request may fail due to connectivity issues between the Frontend and the Backend. + +The Frontend will automatically track which Backends are having connectivity issues with it and avoid routing new requests to the Backends with known connectivity issues. + +For ongoing requests, there is a `--migration-limit` flag which can be set on the Backend that tells the Frontend how many times a request can be migrated to another Backend should there be a loss of connectivity to the current Backend. + +For example, +```bash +python3 -m dynamo.trtllm ... --migration-limit=3 +``` +indicates a request to this model may be migrated up to 3 times to another Backend, before failing the request, should the Frontend detects a connectivity issue to the current Backend. + +The migrated request will continue responding to the original request, allowing for a seamless transition between Backends, and a reduced overall request failure rate at the Frontend for enhanced user experience. 
+ +## Client + +NOTE: To send a request to a multi-node deployment, target the node which is running `python3 -m dynamo.frontend `. diff --git a/docs/components/backends/trtllm/deploy/README.md b/docs/components/backends/trtllm/deploy/README.md new file mode 100644 index 0000000000..1b718e7023 --- /dev/null +++ b/docs/components/backends/trtllm/deploy/README.md @@ -0,0 +1,286 @@ +# TensorRT-LLM Kubernetes Deployment Configurations + +This directory contains Kubernetes Custom Resource Definition (CRD) templates for deploying TensorRT-LLM inference graphs using the **DynamoGraphDeployment** resource. + +## Available Deployment Patterns + +### 1. **Aggregated Deployment** (`agg.yaml`) +Basic deployment pattern with frontend and a single worker. + +**Architecture:** +- `Frontend`: OpenAI-compatible API server (with kv router mode disabled) +- `TRTLLMWorker`: Single worker handling both prefill and decode + +### 2. **Aggregated Router Deployment** (`agg_router.yaml`) +Enhanced aggregated deployment with KV cache routing capabilities. + +**Architecture:** +- `Frontend`: OpenAI-compatible API server (with kv router mode enabled) +- `TRTLLMWorker`: Multiple workers handling both prefill and decode (2 replicas for load balancing) + +### 3. **Disaggregated Deployment** (`disagg.yaml`) +High-performance deployment with separated prefill and decode workers. + +**Architecture:** +- `Frontend`: HTTP API server coordinating between workers +- `TRTLLMDecodeWorker`: Specialized decode-only worker +- `TRTLLMPrefillWorker`: Specialized prefill-only worker + +### 4. **Disaggregated Router Deployment** (`disagg_router.yaml`) +Advanced disaggregated deployment with KV cache routing capabilities. + +**Architecture:** +- `Frontend`: HTTP API server (with kv router mode enabled) +- `TRTLLMDecodeWorker`: Specialized decode-only worker +- `TRTLLMPrefillWorker`: Specialized prefill-only worker (2 replicas for load balancing) + +## CRD Structure + +All templates use the **DynamoGraphDeployment** CRD: + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: +spec: + services: + : + # Service configuration +``` + +### Key Configuration Options + +**Resource Management:** +```yaml +resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" +``` + +**Container Configuration:** +```yaml +extraPodSpec: + mainContainer: + image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17 + workingDir: /workspace/components/backends/trtllm + args: + - "python3" + - "-m" + - "dynamo.trtllm" + # Model-specific arguments +``` + +## Prerequisites + +Before using these templates, ensure you have: + +1. **Dynamo Cloud Platform installed** - See [Quickstart Guide](../../../../guides/dynamo_deploy/quickstart.md) +2. **Kubernetes cluster with GPU support** +3. **Container registry access** for TensorRT-LLM runtime images +4. **HuggingFace token secret** (referenced as `envFromSecret: hf-token-secret`) + +### Container Images + +The deployment files currently require access to `nvcr.io/nvidian/nim-llm-dev/trtllm-runtime`. 
If you don't have access, build and push your own image: + +```bash +./container/build.sh --framework tensorrtllm +# Tag and push to your container registry +# Update the image references in the YAML files +``` + +**Note:** TensorRT-LLM uses git-lfs, which needs to be installed in advance: +```bash +apt-get update && apt-get -y install git git-lfs +``` + +For ARM machines, use: +```bash +./container/build.sh --framework tensorrtllm --platform linux/arm64 +``` + +## Usage + +### 1. Choose Your Template +Select the deployment pattern that matches your requirements: +- Use `agg.yaml` for simple testing +- Use `agg_router.yaml` for production with KV cache routing and load balancing +- Use `disagg.yaml` for maximum performance with separated workers +- Use `disagg_router.yaml` for high-performance with KV cache routing and disaggregation + +### 2. Customize Configuration +Edit the template to match your environment: + +```yaml +# Update image registry and tag +image: your-registry/trtllm-runtime:your-tag + +# Configure your model and deployment settings +args: + - "python3" + - "-m" + - "dynamo.trtllm" + # Add your model-specific arguments +``` + +### 3. Deploy + +See the [Create Deployment Guide](../../../../guides/dynamo_deploy/create_deployment.md) to learn how to deploy the deployment file. + +First, create a secret for the HuggingFace token. +```bash +export HF_TOKEN=your_hf_token +kubectl create secret generic hf-token-secret \ + --from-literal=HF_TOKEN=${HF_TOKEN} \ + -n ${NAMESPACE} +``` + +Then, deploy the model using the deployment file. + +Export the NAMESPACE you used in your Dynamo Cloud Installation. + +```bash +cd dynamo/components/backends/trtllm/deploy +export DEPLOYMENT_FILE=agg.yaml +kubectl apply -f $DEPLOYMENT_FILE -n $NAMESPACE +``` + +### 4. Using Custom Dynamo Frameworks Image for TensorRT-LLM + +To use a custom dynamo frameworks image for TensorRT-LLM, you can update the deployment file using yq: + +```bash +export DEPLOYMENT_FILE=agg.yaml +export FRAMEWORK_RUNTIME_IMAGE= + +yq '.spec.services.[].extraPodSpec.mainContainer.image = env(FRAMEWORK_RUNTIME_IMAGE)' $DEPLOYMENT_FILE > $DEPLOYMENT_FILE.generated +kubectl apply -f $DEPLOYMENT_FILE.generated -n $NAMESPACE +``` + +### 5. Port Forwarding + +After deployment, forward the frontend service to access the API: + +```bash +kubectl port-forward deployment/trtllm-v1-disagg-frontend- 8000:8000 +``` + +## Configuration Options + +### Environment Variables + +To change `DYN_LOG` level, edit the yaml file by adding: + +```yaml +... +spec: + envs: + - name: DYN_LOG + value: "debug" # or other log levels + ... +``` + +### TensorRT-LLM Worker Configuration + +TensorRT-LLM workers are configured through command-line arguments in the deployment YAML. 
Key configuration areas include: + +- **Disaggregation Strategy**: Control request flow with `DISAGGREGATION_STRATEGY` environment variable +- **KV Cache Transfer**: Choose between UCX (default) or NIXL for disaggregated serving +- **Request Migration**: Enable graceful failure handling with `--migration-limit` + +### Disaggregation Strategy + +The disaggregation strategy controls how requests are distributed between prefill and decode workers: + +- **`decode_first`** (default): Requests routed to decode worker first, then forwarded to prefill worker +- **`prefill_first`**: Requests routed directly to prefill worker (used with KV routing) + +Set via environment variable: +```yaml +envs: + - name: DISAGGREGATION_STRATEGY + value: "prefill_first" +``` + +**Note:** For multi-node deployments, target the node running `python3 -m dynamo.frontend `. + +## Model Configuration + +The deployment templates support various TensorRT-LLM models and configurations. You can customize model-specific arguments in the worker configuration sections of the YAML files. + +### Multi-Token Prediction (MTP) Support + +For models supporting Multi-Token Prediction (such as DeepSeek R1), special configuration is available. Note that MTP requires the experimental TensorRT-LLM commit: + +```bash +./container/build.sh --framework tensorrtllm --use-default-experimental-tensorrtllm-commit +``` + +## Monitoring and Health + +- **Frontend health endpoint**: `http://:8000/health` +- **Worker health endpoints**: `http://:9090/health` +- **Liveness probes**: Check process health every 5 seconds +- **Readiness probes**: Check service readiness with configurable delays + +## KV Cache Transfer Methods + +TensorRT-LLM supports two methods for KV cache transfer in disaggregated serving: + +- **UCX** (default): Standard method for KV cache transfer +- **NIXL** (experimental): Alternative transfer method + +For detailed configuration instructions, see the [KV cache transfer guide](../kv-cache-tranfer.md). + +## Request Migration + +You can enable request migration to handle worker failures gracefully by adding the migration limit argument to worker configurations: + +```yaml +args: + - "python3" + - "-m" + - "dynamo.trtllm" + - "--migration-limit" + - "3" +``` + +## Benchmarking + +To benchmark your deployment with GenAI-Perf, see this utility script: +```bash +{REPO_ROOT}/benchmarks/llm/perf.sh +``` +Configure the `model` name and `host` based on your deployment. + +## Further Reading + +- **Deployment Guide**: [Creating Kubernetes Deployments](../../../../guides/dynamo_deploy/create_deployment.md) +- **Quickstart**: [Deployment Quickstart](../../../../guides/dynamo_deploy/quickstart.md) +- **Platform Setup**: [Dynamo Cloud Installation](../../../../guides/dynamo_deploy/dynamo_cloud.md) +- **Examples**: [Deployment Examples](../../../../examples/README.md) +- **Architecture Docs**: [Disaggregated Serving](../../../../architecture/disagg_serving.md), [KV-Aware Routing](../../../../architecture/kv_cache_routing.md) +- **Multinode Deployment**: [Multinode Examples](../multinode-examples.md) +- **Speculative Decoding**: [Llama 4 + Eagle Guide](../llama4_plus_eagle.md) +- **Kubernetes CRDs**: [Custom Resources Documentation](https://kubernetes.io/docs/concepts/extend-kubernetes/api-extension/custom-resources/) + +## Troubleshooting + +Common issues and solutions: + +1. **Pod fails to start**: Check image registry access and HuggingFace token secret +2. **GPU not allocated**: Verify cluster has GPU nodes and proper resource limits +3. 
**Health check failures**: Review model loading logs and increase `initialDelaySeconds` +4. **Out of memory**: Increase memory limits or reduce model batch size +5. **Port forwarding issues**: Ensure correct pod UUID in port-forward command +6. **Git LFS issues**: Ensure git-lfs is installed before building containers +7. **ARM deployment**: Use `--platform linux/arm64` when building on ARM machines + +For additional support, refer to the [deployment troubleshooting guide](../../../../guides/dynamo_deploy/quickstart.md). diff --git a/components/backends/trtllm/kv-cache-tranfer.md b/docs/components/backends/trtllm/kv-cache-tranfer.md similarity index 97% rename from components/backends/trtllm/kv-cache-tranfer.md rename to docs/components/backends/trtllm/kv-cache-tranfer.md index 14247f71fe..f604f158a8 100644 --- a/components/backends/trtllm/kv-cache-tranfer.md +++ b/docs/components/backends/trtllm/kv-cache-tranfer.md @@ -56,7 +56,7 @@ To enable NIXL for KV cache transfer in disaggregated serving: See [run container](./README.md#run-container) section to learn how to start the container image built in previous step. 3. **Start the disaggregated service:** - See [disaggregated serving](./README.md#disaggregated-serving) to see how to start the deployment. + See [disaggregated serving](./README.md) to see how to start the deployment. 4. **Send the request:** See [client](./README.md#client) section to learn how to send the request to deployment. diff --git a/components/backends/trtllm/llama4_plus_eagle.md b/docs/components/backends/trtllm/llama4_plus_eagle.md similarity index 93% rename from components/backends/trtllm/llama4_plus_eagle.md rename to docs/components/backends/trtllm/llama4_plus_eagle.md index 2d542f7a1a..1a59954896 100644 --- a/components/backends/trtllm/llama4_plus_eagle.md +++ b/docs/components/backends/trtllm/llama4_plus_eagle.md @@ -17,7 +17,7 @@ limitations under the License. # Llama 4 Maverick Instruct with Eagle Speculative Decoding on SLURM -This guide demonstrates how to deploy Llama 4 Maverick Instruct with Eagle Speculative Decoding on GB200x4 nodes. We will be following the [multi-node deployment instructions](./multinode/multinode-examples.md) to set up the environment for the following scenarios: +This guide demonstrates how to deploy Llama 4 Maverick Instruct with Eagle Speculative Decoding on GB200x4 nodes. We will be following the [multi-node deployment instructions](./multinode-examples.md) to set up the environment for the following scenarios: - **Aggregated Serving:** Deploy the entire Llama 4 model on a single GB200x4 node for end-to-end serving. @@ -57,7 +57,7 @@ export MODEL_PATH="nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8" export SERVED_MODEL_NAME="nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8" ``` -See [this](./multinode/multinode-examples.md#setup) section from multinode guide to learn more about the above options. +See [this](multinode-examples.md#setup) section from multinode guide to learn more about the above options. ## Aggregated Serving @@ -82,7 +82,7 @@ export DECODE_ENGINE_CONFIG="/mnt/engine_configs/llama4/eagle/eagle_decode.yaml" ## Example Request -See [here](./multinode/multinode-examples.md#example-request) to learn how to send a request to the deployment. +See [here](./multinode-examples.md#example-request) to learn how to send a request to the deployment. 
``` curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{ diff --git a/components/backends/trtllm/multinode/multinode-examples.md b/docs/components/backends/trtllm/multinode-examples.md similarity index 100% rename from components/backends/trtllm/multinode/multinode-examples.md rename to docs/components/backends/trtllm/multinode-examples.md diff --git a/docs/components/backends/vllm/README.md b/docs/components/backends/vllm/README.md deleted file mode 120000 index ec40eb5e49..0000000000 --- a/docs/components/backends/vllm/README.md +++ /dev/null @@ -1 +0,0 @@ -../../../../components/backends/vllm/README.md \ No newline at end of file diff --git a/docs/components/backends/vllm/README.md b/docs/components/backends/vllm/README.md new file mode 100644 index 0000000000..7b2744801d --- /dev/null +++ b/docs/components/backends/vllm/README.md @@ -0,0 +1,184 @@ + + +# LLM Deployment using vLLM + +This directory contains a Dynamo vllm engine and reference implementations for deploying Large Language Models (LLMs) in various configurations using vLLM. For Dynamo integration, we leverage vLLM's native KV cache events, NIXL based transfer mechanisms, and metric reporting to enable KV-aware routing and P/D disaggregation. + +## Use the Latest Release + +We recommend using the latest stable release of Dynamo to avoid breaking changes: + +[![GitHub Release](https://img.shields.io/github/v/release/ai-dynamo/dynamo)](https://github.com/ai-dynamo/dynamo/releases/latest) + +You can find the latest release [here](https://github.com/ai-dynamo/dynamo/releases/latest) and check out the corresponding branch with: + +```bash +git checkout $(git describe --tags $(git rev-list --tags --max-count=1)) +``` + +--- + +## Table of Contents +- [Feature Support Matrix](#feature-support-matrix) +- [Quick Start](#quick-start) +- [Single Node Examples](#run-single-node-examples) +- [Advanced Examples](#advanced-examples) +- [Deploy on Kubernetes](#kubernetes-deployment) +- [Configuration](#configuration) + +## Feature Support Matrix + +### Core Dynamo Features + +| Feature | vLLM | Notes | +|---------|------|-------| +| [**Disaggregated Serving**](../../../architecture/disagg_serving.md) | βœ… | | +| [**Conditional Disaggregation**](../../../architecture/disagg_serving.md#conditional-disaggregation) | 🚧 | WIP | +| [**KV-Aware Routing**](../../../architecture/kv_cache_routing.md) | βœ… | | +| [**SLA-Based Planner**](../../../architecture/sla_planner.md) | βœ… | | +| [**Load Based Planner**](../../../architecture/load_planner.md) | 🚧 | WIP | +| [**KVBM**](../../../architecture/kvbm_architecture.md) | 🚧 | WIP | + +### Large Scale P/D and WideEP Features + +| Feature | vLLM | Notes | +|--------------------|------|-----------------------------------------------------------------------| +| **WideEP** | βœ… | Support for PPLX / DeepEP not verified | +| **Attention DP** | βœ… | Supported via external control of DP ranks | +| **GB200 Support** | 🚧 | Container functional on main | + +## Quick Start + +Below we provide a guide that lets you run all of our the common deployment patterns on a single node. + +### Start NATS and ETCD in the background + +Start using Docker Compose + +```bash +docker compose -f deploy/docker-compose.yml up -d +``` + +### Pull or build container + +We have public images available on [NGC Catalog](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-dynamo/collections/ai-dynamo/artifacts). 
If you'd like to build your own container from source: + +```bash +./container/build.sh --framework VLLM +``` + +### Run container + +```bash +./container/run.sh -it --framework VLLM [--mount-workspace] +``` + +This includes the specific commit [vllm-project/vllm#19790](https://github.com/vllm-project/vllm/pull/19790) which enables support for external control of the DP ranks. + +## Run Single Node Examples + +> [!IMPORTANT] +> Below we provide simple shell scripts that run the components for each configuration. Each shell script runs `python3 -m dynamo.frontend` to start the ingress and uses `python3 -m dynamo.vllm` to start the vLLM workers. You can also run each command in separate terminals for better log visibility. + +This figure shows an overview of the major components to deploy: + +``` ++------+ +-----------+ +------------------+ +---------------+ +| HTTP |----->| dynamo |----->| vLLM Worker |------------>| vLLM Prefill | +| |<-----| ingress |<-----| |<------------| Worker | ++------+ +-----------+ +------------------+ +---------------+ + | ^ | + query best | | return | publish kv events + worker | | worker_id v + | | +------------------+ + | +---------| kv-router | + +------------->| | + +------------------+ +``` + +Note: The above architecture illustrates all the components. The final components that get spawned depend upon the chosen deployment pattern. + +### Aggregated Serving + +```bash +# requires one gpu +cd components/backends/vllm +bash launch/agg.sh +``` + +### Aggregated Serving with KV Routing + +```bash +# requires two gpus +cd components/backends/vllm +bash launch/agg_router.sh +``` + +### Disaggregated Serving + +```bash +# requires two gpus +cd components/backends/vllm +bash launch/disagg.sh +``` + +### Disaggregated Serving with KV Routing + +```bash +# requires three gpus +cd components/backends/vllm +bash launch/disagg_router.sh +``` + +### Single Node Data Parallel Attention / Expert Parallelism + +This example is not meant to be performant but showcases Dynamo routing to data parallel workers + +```bash +# requires four gpus +cd components/backends/vllm +bash launch/dep.sh +``` + +> [!TIP] +> Run a disaggregated example and try adding another prefill worker once the setup is running! The system will automatically discover and utilize the new worker. + +## Advanced Examples + +Below we provide a selected list of advanced deployments. Please open up an issue if you'd like to see a specific example! + +### Kubernetes Deployment + +For complete Kubernetes deployment instructions, configurations, and troubleshooting, see [vLLM Kubernetes Deployment Guide](deploy/README.md) + +## Configuration + +vLLM workers are configured through command-line arguments. Key parameters include: + +- `--endpoint`: Dynamo endpoint in format `dyn://namespace.component.endpoint` +- `--model`: Model to serve (e.g., `Qwen/Qwen3-0.6B`) +- `--is-prefill-worker`: Enable prefill-only mode for disaggregated serving +- `--metrics-endpoint-port`: Port for publishing KV metrics to Dynamo + +See `args.py` for the full list of configuration options and their defaults. + +The [documentation](https://docs.vllm.ai/en/v0.9.2/configuration/serve_args.html?h=serve+arg) for the vLLM CLI args points to running 'vllm serve --help' to see what CLI args can be added. We use the same argument parser as vLLM. + +## Request Migration + +In a Distributed System, a request may fail due to connectivity issues between the Frontend and the Backend. 
+ +The Frontend will automatically track which Backends are having connectivity issues with it and avoid routing new requests to the Backends with known connectivity issues. + +For ongoing requests, there is a `--migration-limit` flag which can be set on the Backend that tells the Frontend how many times a request can be migrated to another Backend should there be a loss of connectivity to the current Backend. + +For example, +```bash +python3 -m dynamo.vllm ... --migration-limit=3 +``` +indicates that a request to this model may be migrated up to 3 times to another Backend before the request is failed, should the Frontend detect a connectivity issue to the current Backend. + +A migrated request continues streaming its response to the original client, allowing for a seamless transition between Backends and a lower overall request failure rate at the Frontend. diff --git a/docs/components/backends/vllm/deploy/README.md b/docs/components/backends/vllm/deploy/README.md new file mode 100644 index 0000000000..1176cd3829 --- /dev/null +++ b/docs/components/backends/vllm/deploy/README.md @@ -0,0 +1,255 @@ +# vLLM Kubernetes Deployment Configurations + +This directory contains Kubernetes Custom Resource Definition (CRD) templates for deploying vLLM inference graphs using the **DynamoGraphDeployment** resource. + +## Available Deployment Patterns + +### 1. **Aggregated Deployment** (`agg.yaml`) +Basic deployment pattern with frontend and a single decode worker. + +**Architecture:** +- `Frontend`: OpenAI-compatible API server (with kv router mode disabled) +- `VLLMDecodeWorker`: Single worker handling both prefill and decode + +### 2. **Aggregated Router Deployment** (`agg_router.yaml`) +Enhanced aggregated deployment with KV cache routing capabilities. + +**Architecture:** +- `Frontend`: OpenAI-compatible API server (with kv router mode enabled) +- `VLLMDecodeWorker`: Single worker handling both prefill and decode + +### 3. **Disaggregated Deployment** (`disagg.yaml`) +High-performance deployment with separated prefill and decode workers. + +**Architecture:** +- `Frontend`: HTTP API server coordinating between workers +- `VLLMDecodeWorker`: Specialized decode-only worker +- `VLLMPrefillWorker`: Specialized prefill-only worker (`--is-prefill-worker`) +- Communication via NIXL transfer backend + +### 4. **Disaggregated Router Deployment** (`disagg_router.yaml`) +Advanced disaggregated deployment with KV cache routing capabilities. + +**Architecture:** +- `Frontend`: HTTP API server with KV-aware routing +- `VLLMDecodeWorker`: Specialized decode-only worker +- `VLLMPrefillWorker`: Specialized prefill-only worker (`--is-prefill-worker`) + +## CRD Structure + +All templates use the **DynamoGraphDeployment** CRD: + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: +spec: + services: + : + # Service configuration +``` + +### Key Configuration Options + +**Resource Management:** +```yaml +resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" +``` + +**Container Configuration:** +```yaml +extraPodSpec: + mainContainer: + image: my-registry/vllm-runtime:my-tag + workingDir: /workspace/components/backends/vllm + args: + - "python3" + - "-m" + - "dynamo.vllm" + # Model-specific arguments +``` + +## Prerequisites + +Before using these templates, ensure you have: + +1. **Dynamo Cloud Platform installed** - See [Quickstart Guide](../../../../guides/dynamo_deploy/quickstart.md) +2. 
**Kubernetes cluster with GPU support** +3. **Container registry access** for vLLM runtime images +4. **HuggingFace token secret** (referenced as `envFromSecret: hf-token-secret`) + +### Container Images + +We have public images available on [NGC Catalog](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-dynamo/collections/ai-dynamo/artifacts). If you'd prefer to use your own registry, build and push your own image: + +```bash +./container/build.sh --framework VLLM +# Tag and push to your container registry +# Update the image references in the YAML files +``` + +### Pre-Deployment Profiling (SLA Planner Only) + +If using the SLA Planner deployment (`disagg_planner.yaml`), follow the [pre-deployment profiling guide](../../../../architecture/pre_deployment_profiling.md) to run pre-deployment profiling. The results will be saved to the `profiling-pvc` PVC and queried by the SLA Planner. + +## Usage + +### 1. Choose Your Template +Select the deployment pattern that matches your requirements: +- Use `agg.yaml` for simple testing +- Use `agg_router.yaml` for production with load balancing +- Use `disagg.yaml` for maximum performance +- Use `disagg_router.yaml` for high-performance with KV cache routing +- Use `disagg_planner.yaml` for SLA-optimized performance + +### 2. Customize Configuration +Edit the template to match your environment: + +```yaml +# Update image registry and tag +image: your-registry/vllm-runtime:your-tag + +# Configure your model +args: + - "--model" + - "your-org/your-model" +``` + +### 3. Deploy + +Use the following command to deploy the deployment file. + +First, create a secret for the HuggingFace token. +```bash +export HF_TOKEN=your_hf_token +kubectl create secret generic hf-token-secret \ + --from-literal=HF_TOKEN=${HF_TOKEN} \ + -n ${NAMESPACE} +``` + +Then, deploy the model using the deployment file. + +Export the NAMESPACE you used in your Dynamo Cloud Installation. + +```bash +cd /components/backends/vllm/deploy +export DEPLOYMENT_FILE=agg.yaml + +kubectl apply -f $DEPLOYMENT_FILE -n $NAMESPACE +``` + +### 4. Using Custom Dynamo Frameworks Image for vLLM + +To use a custom dynamo frameworks image for vLLM, you can update the deployment file using yq: + +```bash +export DEPLOYMENT_FILE=agg.yaml +export FRAMEWORK_RUNTIME_IMAGE= + +yq '.spec.services.[].extraPodSpec.mainContainer.image = env(FRAMEWORK_RUNTIME_IMAGE)' $DEPLOYMENT_FILE > $DEPLOYMENT_FILE.generated +kubectl apply -f $DEPLOYMENT_FILE.generated -n $NAMESPACE +``` + +### 5. Port Forwarding + +After deployment, forward the frontend service to access the API: + +```bash +kubectl port-forward deployment/vllm-v1-disagg-frontend- 8000:8000 +``` + +## Configuration Options + +### Environment Variables + +To change `DYN_LOG` level, edit the yaml file by adding: + +```yaml +... +spec: + envs: + - name: DYN_LOG + value: "debug" # or other log levels + ... +``` + +### vLLM Worker Configuration + +vLLM workers are configured through command-line arguments. Key parameters include: + +- `--endpoint`: Dynamo endpoint in format `dyn://namespace.component.endpoint` +- `--model`: Model to serve (e.g., `Qwen/Qwen3-0.6B`) +- `--is-prefill-worker`: Enable prefill-only mode for disaggregated serving +- `--metrics-endpoint-port`: Port for publishing KV metrics to Dynamo + +See the [vLLM CLI documentation](https://docs.vllm.ai/en/v0.9.2/configuration/serve_args.html?h=serve+arg) for the full list of configuration options. 
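As a sketch of how these flags map onto the templates, you could append an extra argument to a worker's `args` list with `yq` before applying. The service key (`VLLMDecodeWorker`) and the flag value below are illustrative; match them to the service names in your chosen YAML file:

```bash
# Illustrative only: append a request-migration flag to the decode worker's
# args in agg.yaml, then apply the generated file. Adjust the service key and
# value to your template.
export DEPLOYMENT_FILE=agg.yaml
yq '.spec.services.VLLMDecodeWorker.extraPodSpec.mainContainer.args += ["--migration-limit", "3"]' \
  $DEPLOYMENT_FILE > $DEPLOYMENT_FILE.generated
kubectl apply -f $DEPLOYMENT_FILE.generated -n $NAMESPACE
```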
+ +## Testing the Deployment + +Send a test request to verify your deployment: + +```bash +curl localhost:8080/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen3-0.6B", + "messages": [ + { + "role": "user", + "content": "In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden." + } + ], + "stream": false, + "max_tokens": 30 + }' +``` + +## Model Configuration + +All templates use **Qwen/Qwen3-0.6B** as the default model, but you can use any vLLM-supported LLM model and configuration arguments. + +## Monitoring and Health + +- **Frontend health endpoint**: `http://:8000/health` +- **Liveness probes**: Check process health regularly +- **KV metrics**: Published via metrics endpoint port + +## Request Migration + +You can enable request migration to handle worker failures gracefully by adding the migration limit argument to worker configurations: + +```yaml +args: + - "--migration-limit" + - "3" +``` + +## Further Reading + +- **Deployment Guide**: [Creating Kubernetes Deployments](../../../../guides/dynamo_deploy/create_deployment.md) +- **Quickstart**: [Deployment Quickstart](../../../../guides/dynamo_deploy/quickstart.md) +- **Platform Setup**: [Dynamo Cloud Installation](../../../../guides/dynamo_deploy/dynamo_cloud.md) +- **SLA Planner**: [SLA Planner Deployment Guide](../../../../guides/dynamo_deploy/sla_planner_deployment.md) +- **Examples**: [Deployment Examples](../../../../examples/README.md) +- **Architecture Docs**: [Disaggregated Serving](../../../../architecture/disagg_serving.md), [KV-Aware Routing](../../../../architecture/kv_cache_routing.md) + +## Troubleshooting + +Common issues and solutions: + +1. **Pod fails to start**: Check image registry access and HuggingFace token secret +2. **GPU not allocated**: Verify cluster has GPU nodes and proper resource limits +3. **Health check failures**: Review model loading logs and increase `initialDelaySeconds` +4. **Out of memory**: Increase memory limits or reduce model batch size +5. **Port forwarding issues**: Ensure correct pod UUID in port-forward command + +For additional support, refer to the [deployment troubleshooting guide](../../../../guides/dynamo_deploy/quickstart.md). diff --git a/docs/components/backends/vllm/multi-node.md b/docs/components/backends/vllm/multi-node.md new file mode 100644 index 0000000000..6cf928104b --- /dev/null +++ b/docs/components/backends/vllm/multi-node.md @@ -0,0 +1,110 @@ + + +# Multi-node Examples + +This guide covers deploying vLLM across multiple nodes using Dynamo's distributed capabilities. 
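Before starting, it can help to verify that each worker node can reach the head node's NATS and etcd endpoints; a minimal connectivity check, assuming `nc` (netcat) is installed and that `HEAD_NODE_IP` is set as described in Step 2 below:

```bash
# Run from each worker node. Ports 4222 (NATS) and 2379 (etcd) are the
# defaults started by deploy/docker-compose.yml on the head node.
nc -zv ${HEAD_NODE_IP} 4222
nc -zv ${HEAD_NODE_IP} 2379
```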
+ +## Prerequisites + +Multi-node deployments require: +- Multiple nodes with GPU resources +- Network connectivity between nodes (the faster, the better) +- Firewall rules allowing NATS/ETCD communication + +## Infrastructure Setup + +### Step 1: Start NATS/ETCD on Head Node + +Start the required services on your head node. These endpoints must be accessible from all worker nodes: + +```bash +# On head node (node-1) +docker compose -f deploy/docker-compose.yml up -d +``` + +Default ports: +- NATS: 4222 +- ETCD: 2379 + +### Step 2: Configure Environment Variables + +Set the head node IP address and service endpoints. **Set this on all nodes** for easy copy-paste: + +```bash +# Set this on ALL nodes - replace with your actual head node IP +export HEAD_NODE_IP="" + +# Service endpoints (set on all nodes) +export NATS_SERVER="nats://${HEAD_NODE_IP}:4222" +export ETCD_ENDPOINTS="${HEAD_NODE_IP}:2379" +``` + +## Deployment Patterns + +### Multi-node Aggregated Serving + +Deploy vLLM workers across multiple nodes for horizontal scaling: + +**Node 1 (Head Node)**: Run ingress and first worker +```bash +# Start ingress +python -m dynamo.frontend --router-mode kv & + +# Start vLLM worker +python -m dynamo.vllm \ + --model meta-llama/Llama-3.3-70B-Instruct \ + --tensor-parallel-size 8 \ + --enforce-eager +``` + +**Node 2**: Run additional worker +```bash +# Start vLLM worker +python -m dynamo.vllm \ + --model meta-llama/Llama-3.3-70B-Instruct \ + --tensor-parallel-size 8 \ + --enforce-eager +``` + +### Multi-node Disaggregated Serving + +Deploy prefill and decode workers on separate nodes for optimized resource utilization: + +**Node 1**: Run ingress and prefill workers +```bash +# Start ingress +python -m dynamo.frontend --router-mode kv & + +# Start prefill worker +python -m dynamo.vllm \ + --model meta-llama/Llama-3.3-70B-Instruct \ + --tensor-parallel-size 8 \ + --enforce-eager \ + --is-prefill-worker +``` + +**Node 2**: Run decode workers +```bash +# Start decode worker +python -m dynamo.vllm \ + --model meta-llama/Llama-3.3-70B-Instruct \ + --tensor-parallel-size 8 \ + --enforce-eager +``` + + +## TODO + +## Large Model Deployment + +For models requiring more GPUs than are available on a single node, such as tensor-parallel-size 16: + +**Node 1**: First part of tensor-parallel model +```bash +# Start ingress +python -m dynamo.frontend --router-mode kv & +``` + diff --git a/docs/examples/README.md b/docs/examples/README.md index f9e22535d8..123b127162 100644 --- a/docs/examples/README.md +++ b/docs/examples/README.md @@ -4,9 +4,9 @@ Follow individual examples under components/backends/ to serve models locally. -For example follow the [vLLM Backend Example](../../components/backends/vllm/README.md) +For example follow the [vLLM Backend Example](../components/backends/vllm/README.md) -For a basic GPU - unaware example see the [Hello World Example](../../examples/runtime/hello_world/README.md) +For a basic GPU - unaware example see the [Hello World Example](../examples/runtime/hello_world/README.md) ## Deploying Examples to Kubernetes @@ -19,7 +19,6 @@ If you are a **πŸ‘€ Dynamo User** first follow the [Quickstart Guide](../guides/ ### Instructions for Dynamo Contributor If you are a **πŸ§‘β€πŸ’» Dynamo Contributor** you may have to rebuild the dynamo platform images as the code evolves. For more details read the [Cloud Guide](../guides/dynamo_deploy/dynamo_cloud.md) -Read more on deploying Dynamo Cloud read [deploy/cloud/helm/README.md](../../deploy/cloud/helm/README.md). 
### Deploying a particular example @@ -40,9 +39,14 @@ kubectl apply -f components/backends/vllm/deploy/agg.yaml -n ${NAMESPACE} You can use `kubectl get dynamoGraphDeployment -n ${NAMESPACE}` to view your deployment. You can use `kubectl delete dynamoGraphDeployment -n ${NAMESPACE}` to delete the deployment. -We provide a Custom Resource yaml file for many examples under the `deploy/` folder. -Use [VLLM YAML](../../components/backends/vllm/deploy/agg.yaml) for an example. +We provide a Custom Resource yaml file for many examples under the `components/backends//deploy/`folder. +Consult the examples below for the CRs for your specific inference backend. +[View SGLang k8s](../components/backends/sglang/README.md) + +[View vLLM K8s](../components/backends/vllm/README.md#kubernetes-deployment) + +[View TRTLLM k8s](../components/backends/trtllm/deploy/README.md) **Note 1** Example Image diff --git a/docs/examples/runtime/hello_world/README.md b/docs/examples/runtime/hello_world/README.md deleted file mode 120000 index aa7e284f34..0000000000 --- a/docs/examples/runtime/hello_world/README.md +++ /dev/null @@ -1 +0,0 @@ -../../../../examples/runtime/hello_world/README.md \ No newline at end of file diff --git a/docs/examples/runtime/hello_world/README.md b/docs/examples/runtime/hello_world/README.md new file mode 100644 index 0000000000..3ba390050d --- /dev/null +++ b/docs/examples/runtime/hello_world/README.md @@ -0,0 +1,119 @@ + + +# Hello World Example + +This is the simplest Dynamo example demonstrating a basic service using Dynamo's distributed runtime. It showcases the fundamental concepts of creating endpoints and workers in the Dynamo runtime system. + +## Architecture + +```text +Client (dynamo_worker) + β”‚ + β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Backend β”‚ Dynamo endpoint (/generate) +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +## Components + +- **Backend**: A Dynamo service with an endpoint that receives text input and streams back greetings for each comma-separated word +- **Client**: A Dynamo worker that connects to and sends requests to the backend service, then prints out the response + +## Implementation Details + +The example demonstrates: + +- **Endpoint Definition**: Using the `@dynamo_endpoint` decorator to create streaming endpoints +- **Worker Setup**: Using the `@dynamo_worker()` decorator to create distributed runtime workers +- **Service Creation**: Creating services and endpoints using the distributed runtime API +- **Streaming Responses**: Yielding data for real-time streaming +- **Client Integration**: Connecting to services and processing streams +- **Logging**: Basic logging configuration with `configure_dynamo_logging` + +## Getting Started + +## Prerequisites + + Before running this example, ensure you have the following services running: + + - **etcd**: A distributed key-value store used for service discovery and metadata storage + - **NATS**: A high-performance message broker for inter-component communication + + You can start these services using Docker Compose: + + ```bash + # clone the dynamo repository if necessary + # git clone https://github.com/ai-dynamo/dynamo.git + cd dynamo + docker compose -f deploy/docker-compose.yml up -d + ``` + +### Running the Example + +First, start the backend service: +```bash +cd examples/runtime/hello_world +python hello_world.py +``` + +Second, in a separate terminal, run the client: +```bash +cd examples/runtime/hello_world +python client.py +``` + +The client will connect to the backend service 
and print the streaming results. + +### Expected Output + +When running the client, you should see streaming output like: +```text +Hello world! +Hello sun! +Hello moon! +Hello star! +``` + +## Code Structure + +### Backend Service (`hello_world.py`) + +- **`content_generator`**: A dynamo endpoint that processes text input and yields greetings +- **`worker`**: A dynamo worker that sets up the service, creates the endpoint, and serves it + +### Client (`client.py`) + +- **`worker`**: A dynamo worker that connects to the backend service and processes the streaming response + +## Deployment to Kubernetes + +Follow the [Quickstart Guide](../../../guides/dynamo_deploy/quickstart.md) to install Dynamo Cloud. +Then deploy to kubernetes using + +```bash +export NAMESPACE= +cd dynamo +kubectl apply -f examples/runtime/hello_world/deploy/hello_world.yaml -n ${NAMESPACE} +``` + +to delete your deployment: + +```bash +kubectl delete dynamographdeployment hello-world -n ${NAMESPACE} +``` diff --git a/docs/guides/dynamo_deploy/README.md b/docs/guides/dynamo_deploy/README.md index 516162d911..3cf2ccc098 100644 --- a/docs/guides/dynamo_deploy/README.md +++ b/docs/guides/dynamo_deploy/README.md @@ -17,17 +17,84 @@ limitations under the License. # Deploying Inference Graphs to Kubernetes -We expect users to deploy their inference graphs using CRDs or helm charts. + We expect users to deploy their inference graphs using CRDs or helm charts. + +# 1. Install Dynamo Cloud. + +Prior to deploying an inference graph the user should deploy the Dynamo Cloud Platform. Reference the [Quickstart Guide](quickstart.md) for steps to install Dynamo Cloud with Helm. -Prior to deploying an inference graph the user should deploy the Dynamo Cloud Platform. Dynamo Cloud acts as an orchestration layer between the end user and Kubernetes, handling the complexity of deploying your graphs for you. This is a one-time action, only necessary the first time you deploy a DynamoGraph. +# 2. Deploy your inference graph. + +We provide a Custom Resource YAML file for many examples under the components/backends/{engine}/deploy folders. Consult the examples below for the CRs for a specific inference backend. + +[View SGLang K8s](../../components/backends/sglang/deploy/README.md) + +[View vLLM K8s](../../components/backends/vllm/deploy/README.md) + +[View TRT-LLM K8s](../../components/backends/trtllm/deploy/README.md) + +### Deploying a particular example + +```bash +# Set your dynamo root directory +cd +export PROJECT_ROOT=$(pwd) +export NAMESPACE= # the namespace you used to deploy Dynamo cloud to. +``` + +Deploying an example consists of the simple `kubectl apply -f ... -n ${NAMESPACE}` command. For example: + +```bash +kubectl apply -f components/backends/vllm/deploy/agg.yaml -n ${NAMESPACE} +``` + +You can use `kubectl get dynamoGraphDeployment -n ${NAMESPACE}` to view your deployment. +You can use `kubectl delete dynamoGraphDeployment -n ${NAMESPACE}` to delete the deployment. -# 1. Please follow [Installing Dynamo Cloud](./dynamo_cloud.md) for steps to install. -For details about the Dynamo Cloud Platform, see the [Dynamo Operator Guide](dynamo_operator.md) +**Note 1** Example Image -# 2. Follow [Examples](../../examples/README.md) to see how you can deploy your Inference Graphs. +The examples use a prebuilt image from the `nvcr.io` registry. 
+You can utilize public images from [Dynamo NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-dynamo/collections/ai-dynamo) or build your own image and update the image location in your CR file prior to applying. Either way, you will need to overwrite the image in the example YAML. +To build your own image: + +```bash +./container/build.sh --framework +``` + +For example for the `sglang` run +```bash +./container/build.sh --framework sglang +``` + +To overwrite the image in the example: + +```bash +extraPodSpec: + mainContainer: + image: +``` + +**Note 2** +Setup port forward if needed when deploying to Kubernetes. + +List the services in your namespace: + +```bash +kubectl get svc -n ${NAMESPACE} +``` +Look for one that ends in `-frontend` and use it for port forward. + +```bash +SERVICE_NAME=$(kubectl get svc -n ${NAMESPACE} -o name | grep frontend | sed 's|.*/||' | sed 's|-frontend||' | head -n1) +kubectl port-forward svc/${SERVICE_NAME}-frontend 8080:8080 -n ${NAMESPACE} +``` + +Additional Resources: +- [Port Forward Documentation](https://kubernetes.io/docs/tasks/access-application-cluster/port-forward-access-application-cluster/) +- [Examples Deployment Guide](../../examples/README.md#deploying-a-particular-example) ## Manual Deployment with Helm Charts @@ -38,5 +105,5 @@ Users who need more control over their deployments can use the manual deployment - Provides full control over deployment parameters - Requires manual management of infrastructure components - Documentation: - - [Using the Deployment Script](manual_helm_deployment.md#using-the-deployment-script): all-in-one script for manual deployment - - [Helm Deployment Guide](manual_helm_deployment.md#helm-deployment-guide): detailed instructions for manual deployment + - [Helm Deployment Guide](helm_install.md): detailed instructions for manual deployment + diff --git a/docs/guides/dynamo_deploy/dynamo_operator.md b/docs/guides/dynamo_deploy/dynamo_operator.md index 4d3c2a04eb..9e52384da9 100644 --- a/docs/guides/dynamo_deploy/dynamo_operator.md +++ b/docs/guides/dynamo_deploy/dynamo_operator.md @@ -75,7 +75,7 @@ spec: ## GitOps Deployment with FluxCD -This section describes how to use FluxCD for GitOps-based deployment of Dynamo inference graphs. GitOps enables you to manage your Dynamo deployments declaratively using Git as the source of truth. We'll use the [aggregated vLLM example](../../../components/backends/vllm/README.md) to demonstrate the workflow. +This section describes how to use FluxCD for GitOps-based deployment of Dynamo inference graphs. GitOps enables you to manage your Dynamo deployments declaratively using Git as the source of truth. We'll use the [aggregated vLLM example](../../components/backends/vllm/README.md) to demonstrate the workflow. ### Prerequisites diff --git a/docs/guides/dynamo_deploy/gke_setup.md b/docs/guides/dynamo_deploy/gke_setup.md index 46be96c185..07c2fd6649 100644 --- a/docs/guides/dynamo_deploy/gke_setup.md +++ b/docs/guides/dynamo_deploy/gke_setup.md @@ -1,3 +1,5 @@ +:orphan: + # GKE Workload Identity and Artifact Registry Setup Guide This guide explains how to set up Workload Identity in GKE and configure access to Google Artifact Registry. 
diff --git a/docs/guides/dynamo_deploy/helm_install.md b/docs/guides/dynamo_deploy/helm_install.md new file mode 100644 index 0000000000..b8631d1aa7 --- /dev/null +++ b/docs/guides/dynamo_deploy/helm_install.md @@ -0,0 +1,69 @@ + + +# Manual Helm Deployment + +This directory contains Helm charts for manually deploying Dynamo inference graphs to Kubernetes. +This approach allows you to install Dynamo directly using a DynamoGraphDeploymentCRD values file, which is useful for quick deployments or testing specific configurations. + +### Prerequisites + +- Helm 3.0+ +- Kubernetes 1.16+ +- ETCD v3.5+ (without auth) +- NATS v2.10+ (with jetstream enabled) +- Grove v0.1.0+ (optional if deploying using Grove) + +### Basic Installation + +Here is how you would install a VLLM inference backend example. + +```bash +helm upgrade --install dynamo-graph ./deploy/helm/chart -n dynamo-cloud -f ./components/backends/vllm/deploy/agg.yaml +``` + +### Installation using Grove + +Same example as above, but using Grove PodGangSet resources. + +```bash +helm upgrade --install dynamo-graph ./deploy/helm/chart -n dynamo-cloud -f ./components/backends/vllm/deploy/agg.yaml --set deploymentType=grove +``` + +### Customizable Properties + +You can override the default configuration by setting the following properties: + +```bash +helm upgrade --install dynamo-graph ./deploy/helm/chart -n dynamo-cloud \ + -f ./components/backends/vllm/deploy/agg.yaml \ + --set "imagePullSecrets[0].name=docker-secret-1" \ + --set etcdAddr="my-etcd-service:2379" \ + --set natsAddr="nats://my-nats-service:4222" +``` + +#### Available Properties + +| Property | Description | Example | +|----------|-------------|---------| +| `imagePullSecrets` | Array of image pull secrets for accessing private registries | `imagePullSecrets[0].name=docker-secret-1` | +| `etcdAddr` | Address of the etcd service | `dynamo-platform-etcd:2379` | +| `natsAddr` | Address of the NATS messaging service | `nats://dynamo-platform-nats:4222` | +| `deploymentType` | Type of deployment to use. Can be `basic` or `grove`. If not specified, `basic` is used. | `deploymentType=grove` | + + + diff --git a/docs/guides/dynamo_deploy/operator_deployment.md b/docs/guides/dynamo_deploy/operator_deployment.md deleted file mode 120000 index 80ca4341ee..0000000000 --- a/docs/guides/dynamo_deploy/operator_deployment.md +++ /dev/null @@ -1 +0,0 @@ -../../../guides/dynamo_deploy/operator_deployment.md \ No newline at end of file diff --git a/docs/guides/dynamo_deploy/quickstart.md b/docs/guides/dynamo_deploy/quickstart.md index 5639b92f87..5787df31f7 100644 --- a/docs/guides/dynamo_deploy/quickstart.md +++ b/docs/guides/dynamo_deploy/quickstart.md @@ -14,19 +14,11 @@ Use this approach when installing from pre-built helm charts and docker images p ```bash export NAMESPACE=dynamo-cloud -export RELEASE_VERSION=0.3.2 +export RELEASE_VERSION=0.4.0 ``` Install `envsubst`, `kubectl`, `helm` -### Authenticate with NGC - -Go to https://ngc.nvidia.com/org to get your NGC_CLI_API_KEY. - -```bash -helm repo add nvidia https://helm.ngc.nvidia.com/nvidia --username='$oauthtoken' --password= -``` - ### Fetch Helm Charts ```bash @@ -67,7 +59,7 @@ Ensure you have the source code checked out and are in the `dynamo` directory: ### Set Environment Variables -Our examples use the [`nvcr.io`](https://nvcr.io/nvidia/ai-dynamo/) but you can setup your own values if you use another docker registry. +Our examples use the `nvcr.io` but you can setup your own values if you use another docker registry. 
```bash export NAMESPACE=dynamo-cloud # or whatever you prefer. diff --git a/docs/guides/dynamo_run.md b/docs/guides/dynamo_run.md index 0453fc7ccd..9a30270dea 100644 --- a/docs/guides/dynamo_run.md +++ b/docs/guides/dynamo_run.md @@ -211,7 +211,7 @@ The KV-aware routing arguments: ### Request Migration -In a [Distributed System](#distributed-system), a request may fail due to connectivity issues between the HTTP Server and the Worker Engine. +In a Distributed System, a request may fail due to connectivity issues between the HTTP Server and the Worker Engine. The HTTP Server will automatically track which Worker Engines are having connectivity issues with it and avoid routing new requests to the Engines with known connectivity issues. @@ -482,11 +482,11 @@ The trtllm engine requires [etcd](https://etcd.io/) and [nats](https://nats.io/) ##### Step 1: Build the environment -See instructions [here](https://github.com/ai-dynamo/dynamo/blob/main/examples/tensorrt_llm/README.md#build-docker) to build the dynamo container with TensorRT-LLM. +See instructions [here](https://github.com/ai-dynamo/dynamo/tree/main/components/backends/trtllm#build-container) to build the dynamo container with TensorRT-LLM. ##### Step 2: Run the environment -See instructions [here](https://github.com/ai-dynamo/dynamo/blob/main/examples/tensorrt_llm/README.md#run-container) to run the built environment. +See instructions [here](https://github.com/ai-dynamo/dynamo/tree/main/components/backends/trtllm#run-container) to run the built environment. ##### Step 3: Execute `dynamo-run` command @@ -679,10 +679,6 @@ Here are some example engines: - Chat: * [sglang](https://github.com/ai-dynamo/dynamo/blob/main/lib/bindings/python/examples/hello_world/server_sglang_tok.py) -More fully-featured Backend engines (used by `dynamo-run`): -- [vllm](https://github.com/ai-dynamo/dynamo/blob/main/launch/dynamo-run/src/subprocess/vllm_inc.py) -- [sglang](https://github.com/ai-dynamo/dynamo/blob/main/launch/dynamo-run/src/subprocess/sglang_inc.py) - ### Debugging `dynamo-run` and `dynamo-runtime` support [tokio-console](https://github.com/tokio-rs/console). Build with the feature to enable: diff --git a/docs/hidden_toctree.rst b/docs/hidden_toctree.rst index ecd0208317..32895dd738 100644 --- a/docs/hidden_toctree.rst +++ b/docs/hidden_toctree.rst @@ -1,3 +1,5 @@ +:orphan: + .. SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
SPDX-License-Identifier: Apache-2.0 @@ -21,5 +23,36 @@ :maxdepth: 2 :hidden: - guides/README.md - runtime/README.md \ No newline at end of file + runtime/README.md + API/nixl_connect/connector.md + API/nixl_connect/descriptor.md + API/nixl_connect/device.md + API/nixl_connect/device_kind.md + API/nixl_connect/operation_status.md + API/nixl_connect/rdma_metadata.md + API/nixl_connect/readable_operation.md + API/nixl_connect/writable_operation.md + API/nixl_connect/read_operation.md + API/nixl_connect/write_operation.md + components/backends/sglang/deploy/README.md + components/backends/sglang/docs/dsr1-wideep-h100.md + components/backends/sglang/docs/multinode-examples.md + components/backends/sglang/docs/sgl-http-server.md + components/backends/sglang/slurm_jobs/README.md + components/router/README.md + examples/README.md + guides/dynamo_deploy/create_deployment.md + guides/dynamo_deploy/sla_planner_deployment.md + guides/dynamo_deploy/helm_install.md + guides/dynamo_deploy/gke_setup.md + guides/dynamo_deploy/README.md + guides/dynamo_run.md + components/backends/vllm/README.md + components/backends/trtllm/README.md + components/backends/trtllm/deploy/README.md + components/backends/trtllm/llama4_plus_eagle.md + components/backends/trtllm/multinode-examples.md + components/backends/trtllm/kv-cache-tranfer.md + components/backends/vllm/deploy/README.md + components/backends/vllm/multi-node.md + diff --git a/docs/index.rst b/docs/index.rst index c751f0d819..822e96b7bb 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -27,12 +27,60 @@ The NVIDIA Dynamo Platform is a high-performance, low-latency inference framewor - `Dynamo README `_ - `Architecture and features doc `_ - `Usage guides `_ - - `Dynamo examples repo `_ + - `Dynamo examples repo `_ Quick Start ----------------- -Follow the :doc:`Quick Guide to install Dynamo Platform `. + +Local Deployment +~~~~~~~~~~~~~~~~ + +Get started with Dynamo locally in just a few commands: + +**1. Install Dynamo** + +.. code-block:: bash + + # Install uv (recommended Python package manager) + curl -LsSf https://astral.sh/uv/install.sh | sh + + # Create virtual environment and install Dynamo + uv venv venv + source venv/bin/activate + uv pip install "ai-dynamo[sglang]" # or [vllm], [trtllm] + +**2. Start etcd/NATS** + +.. code-block:: bash + + # Start etcd and NATS using Docker Compose + docker compose -f deploy/docker-compose.yml up -d + +**3. Run Dynamo** + +.. code-block:: bash + + # Start the OpenAI compatible frontend + python -m dynamo.frontend + + # In another terminal, start an SGLang worker + python -m dynamo.sglang.worker deepseek-ai/DeepSeek-R1-Distill-Llama-8B + +**4. Test your deployment** + +.. code-block:: bash + + curl localhost:8080/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", + "messages": [{"role": "user", "content": "Hello!"}], + "max_tokens": 50}' + +Kubernetes Deployment +~~~~~~~~~~~~~~~~~~~~~ + +For deployments on Kubernetes, follow the :doc:`Dynamo Platform Quickstart Guide `. Dive in: Examples @@ -92,16 +140,8 @@ The examples below assume you build the latest image yourself from source. If us :hidden: :caption: Using Dynamo - Running Inference Graphs Locally (dynamo-run) - Deploying Inference Graphs - -.. toctree:: - :hidden: - :caption: Usage Guides - Writing Python Workers in Dynamo Disaggregation and Performance Tuning - KV Cache Router Performance Tuning Working with Dynamo Kubernetes Operator .. 
toctree:: @@ -110,31 +150,19 @@ The examples below assume you build the latest image yourself from source. If us Dynamo Deploy Quickstart Dynamo Cloud Kubernetes Platform - Manual Helm Deployment - GKE Setup Guide + Manual Helm Deployment Minikube Setup Guide Model Caching with Fluid -.. toctree:: - :hidden: - :caption: Benchmarking - - Planner Benchmark Example - - -.. toctree:: - :hidden: - :caption: API - - NIXL Connect API - .. toctree:: :hidden: :caption: Examples Hello World LLM Deployment Examples using VLLM + LLM Deployment Examples using SGLang Multinode Examples using SGLang + Planner Benchmark Example LLM Deployment Examples using TensorRT-LLM .. toctree:: @@ -143,6 +171,7 @@ The examples below assume you build the latest image yourself from source. If us Glossary + NIXL Connect API KVBM Reading diff --git a/docs/runtime/README.md b/docs/runtime/README.md index bcd29b8c70..6c0eed0b95 100644 --- a/docs/runtime/README.md +++ b/docs/runtime/README.md @@ -110,7 +110,7 @@ Annotated { data: Some("d"), id: None, event: None, comment: None } #### Python -See the [README.md](../API/python_bindings.md) for details +See the [README.md](https://github.com/ai-dynamo/dynamo/tree/release/0.4.0/lib/runtime/lib/bindings/python) for details The Python and Rust `hello_world` client and server examples are interchangeable, so you can start the Python `server.py` and talk to it from the Rust `client`. diff --git a/docs/support_matrix.md b/docs/support_matrix.md index aefeda1192..3cf5e23969 100644 --- a/docs/support_matrix.md +++ b/docs/support_matrix.md @@ -71,7 +71,7 @@ If you are using a **GPU**, the following GPU models and architectures are suppo | **NIXL** | 0.4.0 | > [!Important] -> Β² Specific versions of TensorRT-LLM supported by Dynamo are subject to change. +> Specific versions of TensorRT-LLM supported by Dynamo are subject to change. ## Cloud Service Provider Compatibility @@ -79,9 +79,12 @@ If you are using a **GPU**, the following GPU models and architectures are suppo | **Host Operating System** | **Version** | **Architecture** | **Status** | | :------------------------ | :---------- | :--------------- | :----------- | -| **Amazon Linux** | 2023 | x86_64 | Supported | +| **Amazon Linux** | 2023 | x86_64 | SupportedΒΉ | +> [!Caution] +> ΒΉ There is a known issue with the TensorRT-LLM framework when running the AL2023 container locally with `docker run --network host ...` due to a [bug](https://github.com/mpi4py/mpi4py/discussions/491#discussioncomment-12660609) in mpi4py. To avoid this issue, replace the `--network host` flag with more precise networking configuration by mapping only the necessary ports (e.g., 4222 for nats, 2379/2380 for etcd, 8080 for frontend). + ## Build Support diff --git a/examples/README.md b/examples/README.md index 13fdfe5ad2..b408f6742a 100644 --- a/examples/README.md +++ b/examples/README.md @@ -22,6 +22,15 @@ This directory contains practical examples demonstrating how to deploy and use D > **Want to see a specific example?** > Open a [GitHub issue](https://github.com/ai-dynamo/dynamo/issues) to request an example you'd like to see, or [open a pull request](https://github.com/ai-dynamo/dynamo/pulls) if you'd like to contribute your own! +## Framework Support + +The /examples directory shows how Dynamo broadly works using major inference engines. 
+ +If you want to see advanced, framework-specific deployment patterns and best practices, check out the [Components Workflows](../components/backends/) directory: +- **[vLLM](../components/backends/vllm/)** – vLLM-specific deployment and configuration +- **[SGLang](../components/backends/sglang/)** – SGLang integration examples and workflows +- **[TensorRT-LLM](../components/backends/trtllm/)** – TensorRT-LLM workflows and optimizations + ## Basics & Tutorials Learn fundamental Dynamo concepts through these introductory examples: @@ -29,7 +38,6 @@ Learn fundamental Dynamo concepts through these introductory examples: - **[Quickstart](basics/quickstart/README.md)** - Simple aggregated serving example with vLLM backend - **[Disaggregated Serving](basics/disaggregated_serving/README.md)** - Prefill/decode separation for enhanced performance and scalability - **[Multi-node](basics/multinode/README.md)** - Distributed inference across multiple nodes and GPUs -- **[Multimodal](basics/multimodal/README.md)** - Multimodal model deployment with E/P/D disaggregated serving ## Deployment Examples @@ -67,13 +75,4 @@ Before running any examples, ensure you have: - **Docker & Docker Compose** - For containerized services - **CUDA-compatible GPU** - For LLM inference (except hello_world, which is non-GPU aware) - **Python 3.9++** - For client scripts and utilities -- **Kubernetes cluster** - For any cloud deployment/K8s examples - -## Framework Support - -These examples show how Dynamo broadly works using major inference engines. - -If you want to see advanced, framework-specific deployment patterns and best practices, check out the [Components Workflows](../components/backends/) directory: -- **[vLLM](../components/backends/vllm/)** – vLLM-specific deployment and configuration -- **[SGLang](../components/backends/sglang/)** – SGLang integration examples and workflows -- **[TensorRT-LLM](../components/backends/trtllm/)** – TensorRT-LLM workflows and optimizations \ No newline at end of file +- **Kubernetes cluster** - For any cloud deployment/K8s examples \ No newline at end of file diff --git a/examples/basics/disaggregated_serving/README.md b/examples/basics/disaggregated_serving/README.md index dee80fcb0f..ba501c43be 100644 --- a/examples/basics/disaggregated_serving/README.md +++ b/examples/basics/disaggregated_serving/README.md @@ -37,8 +37,8 @@ docker compose -f deploy/metrics/docker-compose.yml up -d ## Components - [Frontend](../../../components/frontend/README) - HTTP API endpoint that receives requests and forwards them to the decode worker -- [vLLM Prefill Worker](../../../components/backends/vllm/README) - Specialized worker for prefill phase execution -- [vLLM Decode Worker](../../../components/backends/vllm/README) - Specialized worker that handles requests and decides between local/remote prefill +- [vLLM Prefill Worker](../../../components/backends/vllm/README.md) - Specialized worker for prefill phase execution +- [vLLM Decode Worker](../../../components/backends/vllm/README.md) - Specialized worker that handles requests and decides between local/remote prefill ```mermaid --- diff --git a/examples/basics/multimodal/README.md b/examples/basics/multimodal/README.md deleted file mode 100644 index 693bfdeb98..0000000000 --- a/examples/basics/multimodal/README.md +++ /dev/null @@ -1,480 +0,0 @@ - - -# Multimodal Deployment Examples - -This directory provides example workflows and reference implementations for deploying a multimodal model using Dynamo. 
- -## Use the Latest Release - -We recommend using the latest stable release of dynamo to avoid breaking changes: - -[![GitHub Release](https://img.shields.io/github/v/release/ai-dynamo/dynamo)](https://github.com/ai-dynamo/dynamo/releases/latest) - -You can find the latest release [here](https://github.com/ai-dynamo/dynamo/releases/latest) and check out the corresponding branch with: - -```bash -git checkout $(git describe --tags $(git rev-list --tags --max-count=1)) -``` - -## Multimodal Aggregated Serving - -### Components - -- workers: For aggregated serving, we have two workers, [encode_worker](components/encode_worker.py) for encoding and [decode_worker](components/decode_worker.py) for prefilling and decoding. -- processor: Tokenizes the prompt and passes it to the decode worker. -- frontend: HTTP endpoint to handle incoming requests. - -### Graph - -In this graph, we have two workers, [encode_worker](components/encode_worker.py) and [decode_worker](components/decode_worker.py). -The encode worker is responsible for encoding the image and passing the embeddings to the decode worker via a combination of NATS and RDMA. -The work complete event is sent via NATS, while the embeddings tensor is transferred via RDMA through the NIXL interface. -Its decode worker then prefills and decodes the prompt, just like the [LLM aggregated serving](../llm/README.md) example. -By separating the encode from the prefill and decode stages, we can have a more flexible deployment and scale the -encode worker independently from the prefill and decode workers if needed. - -This figure shows the flow of the graph: -```mermaid -flowchart LR - HTTP --> processor - processor --> HTTP - processor --> decode_worker - decode_worker --> processor - decode_worker --image_url--> encode_worker - encode_worker --embeddings--> decode_worker -``` - -```bash -cd $DYNAMO_HOME/examples/multimodal -# Serve a LLaVA 1.5 7B model: -dynamo serve graphs.agg:Frontend -f ./configs/agg-llava.yaml -# Serve a Qwen2.5-VL model: -# dynamo serve graphs.agg:Frontend -f ./configs/agg-qwen.yaml -# Serve a Phi3V model: -# dynamo serve graphs.agg:Frontend -f ./configs/agg-phi3v.yaml -``` - -### Client - -In another terminal: -```bash -curl http://localhost:8080/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "llava-hf/llava-1.5-7b-hf", - "messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "What is in this image?" - }, - { - "type": "image_url", - "image_url": { - "url": "http://images.cocodataset.org/test2017/000000155781.jpg" - } - } - ] - } - ], - "max_tokens": 300, - "temperature": 0.0, - "stream": false - }' -``` - -If serving the example Qwen model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"Qwen/Qwen2.5-VL-7B-Instruct"`. If serving the example Phi3V model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"microsoft/Phi-3.5-vision-instruct"`. - -You should see a response similar to this: -```json -{"id": "c37b946e-9e58-4d54-88c8-2dbd92c47b0c", "object": "chat.completion", "created": 1747725277, "model": "llava-hf/llava-1.5-7b-hf", "choices": [{"index": 0, "message": {"role": "assistant", "content": " In the image, there is a city bus parked on a street, with a street sign nearby on the right side. The bus appears to be stopped out of service. 
The setting is in a foggy city, giving it a slightly moody atmosphere."}, "finish_reason": "stop"}]} -``` - -## Multimodal Disaggregated Serving - -### Components - -- workers: For disaggregated serving, we have three workers, [encode_worker](components/encode_worker.py) for encoding, [decode_worker](components/decode_worker.py) for decoding, and [prefill_worker](components/prefill_worker.py) for prefilling. -- processor: Tokenizes the prompt and passes it to the decode worker. -- frontend: HTTP endpoint to handle incoming requests. - -### Graph - -In this graph, we have three workers, [encode_worker](components/encode_worker.py), [decode_worker](components/decode_worker.py), and [prefill_worker](components/prefill_worker.py). -For the Llava model, embeddings are only required during the prefill stage. As such, the encode worker is connected directly to the prefill worker. -The encode worker is responsible for encoding the image and passing the embeddings to the prefill worker via a combination of NATS and RDMA. -Its work complete event is sent via NATS, while the embeddings tensor is transferred via RDMA through the NIXL interface. -The prefill worker performs the prefilling step and forwards the KV cache to the decode worker for decoding. -For more details on the roles of the prefill and decode workers, refer to the [LLM disaggregated serving](../llm/README.md) example. - -This figure shows the flow of the graph: -```mermaid -flowchart LR - HTTP --> processor - processor --> HTTP - processor --> decode_worker - decode_worker --> processor - decode_worker --> prefill_worker - prefill_worker --> decode_worker - prefill_worker --image_url--> encode_worker - encode_worker --embeddings--> prefill_worker -``` - -```bash -cd $DYNAMO_HOME/examples/multimodal -dynamo serve graphs.disagg:Frontend -f configs/disagg.yaml -``` - -### Client - -In another terminal: -```bash -curl http://localhost:8080/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "llava-hf/llava-1.5-7b-hf", - "messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "What is in this image?" - }, - { - "type": "image_url", - "image_url": { - "url": "http://images.cocodataset.org/test2017/000000155781.jpg" - } - } - ] - } - ], - "max_tokens": 300, - "temperature": 0.0, - "stream": false - }' -``` - -You should see a response similar to this: -```json -{"id": "c1774d61-3299-4aa3-bea1-a0af6c055ba8", "object": "chat.completion", "created": 1747725645, "model": "llava-hf/llava-1.5-7b-hf", "choices": [{"index": 0, "message": {"role": "assistant", "content": " This image shows a passenger bus traveling down the road near power lines and trees. The bus displays a sign that says \"OUT OF SERVICE\" on its front."}, "finish_reason": "stop"}]} -``` - -***Note***: disaggregation is currently only confirmed to work with LLaVA. Qwen VL and PhiV are not confirmed to be supported. - -## Deployment with Dynamo Operator - -These multimodal examples can be deployed to a Kubernetes cluster using [Dynamo Cloud](../../docs/guides/dynamo_deploy/dynamo_cloud.md) and the Dynamo CLI. - -### Prerequisites - -You must have first followed the instructions in [deploy/cloud/helm/README.md](../../deploy/cloud/helm/README.md) to install Dynamo Cloud on your Kubernetes cluster. - -**Note**: The `KUBE_NS` variable in the following steps must match the Kubernetes namespace where you installed Dynamo Cloud. You must also expose the `dynamo-store` service externally. 
This will be the endpoint the CLI uses to interface with Dynamo Cloud. - -### Deployment Steps - -For detailed deployment instructions, please refer to the [Operator Deployment Guide](../../docs/guides/dynamo_deploy/operator_deployment.md). The following are the specific commands for the multimodal examples: - -```bash -# Set your project root directory -export PROJECT_ROOT=$(pwd) - -# Configure environment variables (see operator_deployment.md for details) -export KUBE_NS=dynamo-cloud -export DYNAMO_CLOUD=http://localhost:8080 # If using port-forward -# OR -# export DYNAMO_CLOUD=https://dynamo-cloud.nvidia.com # If using Ingress/VirtualService - -# Build the Dynamo base image (see operator_deployment.md for details) -export DYNAMO_IMAGE=/: - -# TODO: Apply Dynamo graph deployment for the example -``` - -**Note**: To avoid rate limiting from unauthenticated requests to HuggingFace (HF), you can provide your `HF_TOKEN` as a secret in your deployment. See the [operator deployment guide](../../docs/guides/dynamo_deploy/operator_deployment.md#referencing-secrets-in-your-deployment) for instructions on referencing secrets like `HF_TOKEN` in your deployment configuration. - -**Note**: Optionally add `--Planner.no-operation=false` at the end of the deployment command to enable the planner component to take scaling actions on your deployment. - -### Testing the Deployment - -Once the deployment is complete, you can test it. If you have ingress available for your deployment, you can directly call the url returned -in `dynamo deployment get ${DEPLOYMENT_NAME}` and skip the steps to find and forward the frontend pod. - -```bash -# Find your frontend pod -export FRONTEND_POD=$(kubectl get pods -n ${KUBE_NS} | grep "${DEPLOYMENT_NAME}-frontend" | sort -k1 | tail -n1 | awk '{print $1}') - -# Forward the pod's port to localhost -kubectl port-forward pod/$FRONTEND_POD 8080:8080 -n ${KUBE_NS} - -# Test the API endpoint -curl localhost:8080/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "llava-hf/llava-1.5-7b-hf", - "messages": [ - { - "role": "user", - "content": [ - { "type": "text", "text": "What is in this image?" }, - { "type": "image_url", "image_url": { "url": "http://images.cocodataset.org/test2017/000000155781.jpg" } } - ] - } - ], - "max_tokens": 300, - "temperature": 0.0, - "stream": false - }' -``` - -If serving the example Qwen model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"Qwen/Qwen2.5-VL-7B-Instruct"`. If serving the example Phi3V model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"microsoft/Phi-3.5-vision-instruct"`. - -For more details on managing deployments, testing, and troubleshooting, please refer to the [Operator Deployment Guide](../../docs/guides/dynamo_deploy/operator_deployment.md). - -## Multimodal Aggregated Video Serving - -This example demonstrates deploying an aggregated multimodal model that can process video inputs. - -### Components - -- workers: For video serving, we have two workers, [video_encode_worker](components/video_encode_worker.py) for decoding video into frames, and [video_decode_worker](components/video_decode_worker.py) for prefilling and decoding. -- processor: Tokenizes the prompt and passes it to the decode worker. -- frontend: HTTP endpoint to handle incoming requests. - -### Graph - -In this graph, we have two workers, `video_encode_worker` and `video_decode_worker`. -The `video_encode_worker` is responsible for decoding the video into a series of frames. 
Unlike the image pipeline which generates embeddings, this pipeline passes the raw frames directly to the `video_decode_worker`. This transfer is done efficiently using RDMA. -The `video_decode_worker` then receives these frames, and performs prefill and decode steps with the model. Separating the video processing from the language model inference allows for flexible scaling. - -This figure shows the flow of the graph: -```mermaid -flowchart LR - HTTP --> processor - processor --> HTTP - processor --> video_decode_worker - video_decode_worker --> processor - video_decode_worker --video_url--> video_encode_worker - video_encode_worker --frames--> video_decode_worker -``` - -```bash -cd $DYNAMO_HOME/examples/multimodal -# Serve a LLaVA-NeXT-Video-7B model: -dynamo serve graphs.agg_video:Frontend -f ./configs/agg_video.yaml -``` - -### Client - -In another terminal: -```bash -curl -X 'POST' 'http://localhost:8080/v1/chat/completions' -H 'Content-Type: application/json' -d '{ - "model": "llava-hf/LLaVA-NeXT-Video-7B-hf", - "messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "Describe the video in detail" - }, - { - "type": "video_url", - "video_url": { - "url": "https://storage.googleapis.com/gtv-videos-bucket/sample/BigBuckBunny.mp4" - } - } - ] - } - ], - "max_tokens": 300, - "stream": false - }' | jq -``` - -You should see a response describing the video's content similar to -```json -{ - "id": "b5714626-5889-4bb7-8c51-f3bca65b4683", - "object": "chat.completion", - "created": 1749772533, - "model": "llava-hf/LLaVA-NeXT-Video-7B-hf", - "choices": [ - { - "index": 0, - "message": { - "role": "assistant", - "content": " Sure! The video features a group of anthropomorphic animals who appear human-like. They're out in a meadow, which is a large, open area covered in grasses, and have given human qualities like speaking and a desire to go on adventures. The animals are seen play-fighting with each other clearly seen glancing at the camera when they sense it, blinking, and Roman the second can be directly heard by the camera reciting the line, \"When the challenge becomes insane, the behavior becomes erratic.\" A white rabbit is the first in shot and he winks the left eye and flips the right ear before shaking with the mouse and squirrel friends on a blurry rock ledge under the sky. At some point, the rabbit turns towards the camera and starts playing with the thing, and there's a distant mountain in the background. Furthermore, a little animal from a tree in the background flies with two rocks, and it's joined by the rest of the group of friends. That outro is an elder turtle in the Ramden musical style saturated with a horn-like thing pattern." - }, - "finish_reason": "stop" - } - ] -} -``` - -## Multimodal Disaggregated Video Serving - -This example demonstrates deploying a disaggregated multimodal model that can process video inputs. - -### Dependency - -Video example relies on `av` package for video preprocessing inside the encode_worker. -Please install `av` inside the dynamo container to enable video example. - -`pip install av` - -### Components - -- workers: For disaggregated video serving, we have three workers, [video_encode_worker](components/video_encode_worker.py) for decoding video into frames, [video_decode_worker](components/video_decode_worker.py) for decoding, and [video_prefill_worker](components/video_prefill_worker.py) for prefilling. -- processor: Tokenizes the prompt and passes it to the decode worker. 
-- frontend: HTTP endpoint to handle incoming requests. - -### Graph - -In this graph, we have three workers, `video_encode_worker`, `video_decode_worker`, and `video_prefill_worker`. -For the LLaVA-NeXT-Video-7B model, frames are only required during the prefill stage. As such, the `video_encode_worker` is connected directly to the `video_prefill_worker`. -The `video_encode_worker` is responsible for decoding the video into a series of frames and passing them to the `video_prefill_worker` via RDMA. -The `video_prefill_worker` performs the prefilling step and forwards the KV cache to the `video_decode_worker` for decoding. - -This figure shows the flow of the graph: -```mermaid -flowchart LR - HTTP --> processor - processor --> HTTP - processor --> video_decode_worker - video_decode_worker --> processor - video_decode_worker --> video_prefill_worker - video_prefill_worker --> video_decode_worker - video_prefill_worker --video_url--> video_encode_worker - video_encode_worker --frames--> video_prefill_worker -``` - -```bash -cd $DYNAMO_HOME/examples/multimodal -# Serve a LLaVA-NeXT-Video-7B model: -dynamo serve graphs.disagg_video:Frontend -f ./configs/disagg_video.yaml -``` - -### Client - -In another terminal: -```bash -curl -X 'POST' 'http://localhost:8080/v1/chat/completions' -H 'Content-Type: application/json' -d '{ - "model": "llava-hf/LLaVA-NeXT-Video-7B-hf", - "messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "Describe the video in detail" - }, - { - "type": "video_url", - "video_url": { - "url": "https://storage.googleapis.com/gtv-videos-bucket/sample/BigBuckBunny.mp4" - } - } - ] - } - ], - "max_tokens": 300, - "stream": false - }' | jq -``` - -You should see a response describing the video's content similar to -```json -{ - "id": "d1d641b1-4daf-48d3-9d06-6a60743b5a42", - "object": "chat.completion", - "created": 1749775300, - "model": "llava-hf/LLaVA-NeXT-Video-7B-hf", - "choices": [ - { - "index": 0, - "message": { - "role": "assistant", - "content": " The video features two animals in a lush, green outdoor environment. On the ground, there is a white rabbit with big brown eyes, a playful expression, and two antlers. The rabbit is accompanied by a uniquely colored bird with orange pupils, possibly a squirrel or a hamster, sitting on its head. These two animals seem to have embarked on an unlikely journey, flying together in the sky. The backdrop showcases rolling green hills and trees under the pleasant weather. The sky is clear, indicating a beautiful day. The colors and contrast suggest the landscape is during spring or summer, signifying the rabbit and bird could also be engaging in outdoor activities during those seasons. Overall, it's a charming scene depicting an unlikely yet harmonious pair, enjoying a surprise adventure in nature." - }, - "finish_reason": "stop" - } - ] -} -``` - - -## Deploying Multimodal Examples on Kubernetes - -This guide will help you quickly deploy and clean up the multimodal example services in Kubernetes. - -### Prerequisites - -- **Dynamo Cloud** is already deployed in your target Kubernetes namespace. -- You have `kubectl` access to your cluster and the correct namespace set in `$NAMESPACE`. - - -### Create a secret with huggingface token - -```bash -export HF_TOKEN="huggingfacehub token with read permission to models" -kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN=$HF_TOKEN -n $KUBE_NS || true -``` - ---- - -Choose the example you want to deploy or delete. 
The YAML files are located in `examples/multimodal/deploy/k8s/`. - -### Deploy the Multimodal Example - -```bash -kubectl apply -f examples/multimodal/deploy/k8s/ -n $NAMESPACE -``` - -### Uninstall the Multimodal Example - - -```bash -kubectl delete -f examples/multimodal/deploy/k8s/ -n $NAMESPACE -``` - -### Using a different dynamo container - -To customize the container image used in your deployment, you will need to update the manifest before applying it. - -You can use [`yq`](https://github.com/mikefarah/yq?tab=readme-ov-file#install), a portable command-line YAML processor. - -Please follow the [installation instructions](https://github.com/mikefarah/yq?tab=readme-ov-file#install) for your platform if you do not already have `yq` installed. After installing `yq`, you can generate and apply your manifest as follows: - - -```bash -export DYNAMO_IMAGE=my-registry/my-image:tag - -yq '.spec.services.[].extraPodSpec.mainContainer.image = env(DYNAMO_IMAGE)' $EXAMPLE_FILE > my_example_manifest.yaml - -# install the dynamo example -kubectl apply -f my_example_manifest.yaml -n $NAMESPACE - -# uninstall the dynamo example -kubectl delete -f my_example_manifest.yaml -n $NAMESPACE - -``` \ No newline at end of file diff --git a/examples/basics/multinode/README.md b/examples/basics/multinode/README.md index 9959899648..fadd8af294 100644 --- a/examples/basics/multinode/README.md +++ b/examples/basics/multinode/README.md @@ -85,7 +85,7 @@ Install Dynamo with [SGLang](https://docs.sglang.ai/) support: pip install ai-dynamo[sglang] ``` -For more information about the SGLang backend and its integration with Dynamo, see the [SGLang Backend Documentation](../../components/backends/sglang/README.md). +For more information about the SGLang backend and its integration with Dynamo, see the [SGLang Backend Documentation](../../../components/backends/sglang/README.md). ### 3. 
Network Requirements diff --git a/examples/basics/quickstart/README.md b/examples/basics/quickstart/README.md index 694243d5d6..99dc405a0f 100644 --- a/examples/basics/quickstart/README.md +++ b/examples/basics/quickstart/README.md @@ -18,7 +18,7 @@ docker compose -f deploy/metrics/docker-compose.yml up -d ## Components - [Frontend](../../../components/frontend/README) - A built-in component that launches an OpenAI compliant HTTP server, a pre-processor, and a router in a single process -- [vLLM Backend](../../../components/backends/vllm/README) - A built-in component that runs vLLM within the Dynamo runtime +- [vLLM Backend](../../../components/backends/vllm/README.md) - A built-in component that runs vLLM within the Dynamo runtime ```mermaid --- diff --git a/examples/deployments/EKS/Deploy_VLLM_example.md b/examples/deployments/EKS/Deploy_VLLM_example.md index dd4f027da8..b395781ed5 100644 --- a/examples/deployments/EKS/Deploy_VLLM_example.md +++ b/examples/deployments/EKS/Deploy_VLLM_example.md @@ -25,8 +25,8 @@ dynamo-cloud vllm-agg-router-vllmdecodeworker-787d575485-zkwdd Test the Deployment ``` -kubectl port-forward deployment/vllm-agg-router-frontend 8080:8000 -n dynamo-cloud -curl localhost:8080/v1/chat/completions \ +kubectl port-forward deployment/vllm-agg-router-frontend 8000:8000 -n dynamo-cloud +curl localhost:8000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ "model": "Qwen/Qwen3-0.6B", diff --git a/examples/runtime/hello_world/README.md b/examples/runtime/hello_world/README.md deleted file mode 100644 index 67bb331dc5..0000000000 --- a/examples/runtime/hello_world/README.md +++ /dev/null @@ -1,119 +0,0 @@ - - -# Hello World Example - -This is the simplest Dynamo example demonstrating a basic service using Dynamo's distributed runtime. It showcases the fundamental concepts of creating endpoints and workers in the Dynamo runtime system. 
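This hunk removes the hello_world README from `examples/runtime/hello_world/` (a symlink pointing at the copy under `docs/examples/` is added further down in this diff). Because the prose that follows describes the endpoint/worker pattern without showing code, here is a rough sketch of a minimal backend in that style. Treat it as an assumption-laden illustration: the import path and the `namespace`/`component`/`create_service`/`serve_endpoint` calls are recalled from the Dynamo Python runtime bindings, not copied from the real `hello_world.py`.

```python
# Hypothetical sketch of a hello_world-style backend. The dynamo.runtime import
# and the namespace/component/endpoint calls are assumptions and may not match
# the actual example file shipped with Dynamo.
import asyncio

import uvloop
from dynamo.runtime import DistributedRuntime, dynamo_endpoint, dynamo_worker


@dynamo_endpoint()
async def content_generator(request: str):
    # Stream one greeting per comma-separated word, as the README describes.
    for word in request.split(","):
        yield f"Hello {word}!"


@dynamo_worker()
async def worker(runtime: DistributedRuntime):
    # Register a "backend" component exposing a "generate" endpoint, then serve it.
    component = runtime.namespace("hello_world").component("backend")
    await component.create_service()
    endpoint = component.endpoint("generate")
    await endpoint.serve_endpoint(content_generator)


if __name__ == "__main__":
    uvloop.install()
    asyncio.run(worker())
```

A client resolves the same `hello_world`/`backend`/`generate` endpoint and streams from it, which is what the updated `client.py` later in this diff does when it calls `client.generate(...)`.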
- -## Architecture - -```text -Client (dynamo_worker) - β”‚ - β–Ό -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ Backend β”‚ Dynamo endpoint (/generate) -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ -``` - -## Components - -- **Backend**: A Dynamo service with an endpoint that receives text input and streams back greetings for each comma-separated word -- **Client**: A Dynamo worker that connects to and sends requests to the backend service, then prints out the response - -## Implementation Details - -The example demonstrates: - -- **Endpoint Definition**: Using the `@dynamo_endpoint` decorator to create streaming endpoints -- **Worker Setup**: Using the `@dynamo_worker()` decorator to create distributed runtime workers -- **Service Creation**: Creating services and endpoints using the distributed runtime API -- **Streaming Responses**: Yielding data for real-time streaming -- **Client Integration**: Connecting to services and processing streams -- **Logging**: Basic logging configuration with `configure_dynamo_logging` - -## Getting Started - -## Prerequisites - - Before running this example, ensure you have the following services running: - - - **etcd**: A distributed key-value store used for service discovery and metadata storage - - **NATS**: A high-performance message broker for inter-component communication - - You can start these services using Docker Compose: - - ```bash - # clone the dynamo repository if necessary - # git clone https://github.com/ai-dynamo/dynamo.git - cd dynamo - docker compose -f deploy/docker-compose.yml up -d - ``` - -### Running the Example - -First, start the backend service: -```bash -cd examples/runtime/hello_world -python hello_world.py -``` - -Second, in a separate terminal, run the client: -```bash -cd examples/runtime/hello_world -python client.py -``` - -The client will connect to the backend service and print the streaming results. - -### Expected Output - -When running the client, you should see streaming output like: -```text -Hello world! -Hello sun! -Hello moon! -Hello star! -``` - -## Code Structure - -### Backend Service (`hello_world.py`) - -- **`content_generator`**: A dynamo endpoint that processes text input and yields greetings -- **`worker`**: A dynamo worker that sets up the service, creates the endpoint, and serves it - -### Client (`client.py`) - -- **`worker`**: A dynamo worker that connects to the backend service and processes the streaming response - -## Deployment to Kubernetes - -Follow the [Quickstart Guide](../../../docs/guides/dynamo_deploy/quickstart.md) to install Dynamo Cloud. 
-Then deploy to kubernetes using - -```bash -export NAMESPACE= -cd dynamo -kubectl apply -f examples/runtime/hello_world/deploy/hello_world.yaml -n ${NAMESPACE} -``` - -to delete your deployment: - -```bash -kubectl delete dynamographdeployment hello-world -n ${NAMESPACE} -``` \ No newline at end of file diff --git a/examples/runtime/hello_world/README.md b/examples/runtime/hello_world/README.md new file mode 120000 index 0000000000..f12f83ab97 --- /dev/null +++ b/examples/runtime/hello_world/README.md @@ -0,0 +1 @@ +../../../docs/examples/runtime/hello_world/README.md \ No newline at end of file diff --git a/examples/runtime/hello_world/client.py b/examples/runtime/hello_world/client.py index c685dff407..1eacfcb16e 100644 --- a/examples/runtime/hello_world/client.py +++ b/examples/runtime/hello_world/client.py @@ -31,10 +31,33 @@ async def worker(runtime: DistributedRuntime): client = await endpoint.client() await client.wait_for_instances() - # Issue request and process the stream - stream = await client.generate("world,sun,moon,star") - async for response in stream: - print(response.data()) + idx = 0 + base_delay = 0.1 # Start with 100ms + max_delay = 5.0 # Max 5 seconds + current_delay = base_delay + + while True: + try: + # Issue request and process the stream + idx += 1 + stream = await client.generate(f"Query[{idx}] Hello world") + async for response in stream: + print(response.data()) + # Reset backoff on successful iteration + current_delay = base_delay + # Sleep for 1 second + await asyncio.sleep(1) + except asyncio.CancelledError: + # Re-raise for graceful shutdown + raise + except Exception as e: + # Log the exception with context + print(f"Error in worker iteration {idx}: {type(e).__name__}: {e}") + # Perform exponential backoff + print(f"Retrying after {current_delay:.2f} seconds...") + await asyncio.sleep(current_delay) + # Double the delay for next time, up to max_delay + current_delay = min(current_delay * 2, max_delay) if __name__ == "__main__": diff --git a/lib/bindings/python/Cargo.lock b/lib/bindings/python/Cargo.lock index 0c60e0e802..aa2d31cdf1 100644 --- a/lib/bindings/python/Cargo.lock +++ b/lib/bindings/python/Cargo.lock @@ -1169,7 +1169,7 @@ dependencies = [ [[package]] name = "dynamo-llm" -version = "0.4.0" +version = "0.4.0+post0" dependencies = [ "ahash", "akin", @@ -1235,7 +1235,7 @@ dependencies = [ [[package]] name = "dynamo-py3" -version = "0.4.0" +version = "0.4.0+post0" dependencies = [ "anyhow", "async-openai", @@ -1262,7 +1262,7 @@ dependencies = [ [[package]] name = "dynamo-runtime" -version = "0.4.0" +version = "0.4.0+post0" dependencies = [ "anyhow", "arc-swap", @@ -2912,9 +2912,9 @@ dependencies = [ [[package]] name = "nixl-sys" -version = "0.4.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97f621270fd1ed8af5a8028a1945e6f7e612a38836ce82b720fe54222739df3c" +checksum = "743ed1038b386b75451f9e0bba37cb2e3eea75873635268337d6531be99c9303" dependencies = [ "bindgen", "cc", diff --git a/lib/bindings/python/Cargo.toml b/lib/bindings/python/Cargo.toml index 3f631a5b24..58a84cf919 100644 --- a/lib/bindings/python/Cargo.toml +++ b/lib/bindings/python/Cargo.toml @@ -19,7 +19,7 @@ [package] name = "dynamo-py3" -version = "0.4.0" +version = "0.4.0+post0" edition = "2021" authors = ["NVIDIA"] license = "Apache-2.0" diff --git a/lib/bindings/python/pyproject.toml b/lib/bindings/python/pyproject.toml index 26647d20af..91ca9554f5 100644 --- a/lib/bindings/python/pyproject.toml +++ 
b/lib/bindings/python/pyproject.toml @@ -16,7 +16,7 @@ [project] name = "ai-dynamo-runtime" -dynamic = ["version"] +version = "0.4.0.post0" description = "Dynamo Inference Framework Runtime" readme = "README.md" authors = [ diff --git a/lib/bindings/python/tests/test_block_manager.py b/lib/bindings/python/tests/test_block_manager.py index 94c7b455db..ea2c21c7e1 100644 --- a/lib/bindings/python/tests/test_block_manager.py +++ b/lib/bindings/python/tests/test_block_manager.py @@ -19,7 +19,13 @@ import pytest import torch -from dynamo.llm import BlockManager +# Attempt to import the optional module +try: + from dynamo.llm import BlockManager +except ImportError: + pytest.importorskip( + "optional_module", reason="block-manager feature is not enabled" + ) pytestmark = pytest.mark.pre_merge diff --git a/lib/llm/Cargo.toml b/lib/llm/Cargo.toml index 062faf46f2..b764087194 100644 --- a/lib/llm/Cargo.toml +++ b/lib/llm/Cargo.toml @@ -89,7 +89,7 @@ rayon = "1" dialoguer = { version = "0.11", default-features = false, features = ["editor", "history"] } # block_manager -nixl-sys = {version = "0.4.0", optional = true } +nixl-sys = {version = "0.4.1", optional = true } cudarc = { version = "0.16.2", features = ["cuda-12020"], optional = true } ndarray = { version = "0.16", optional = true } nix = { version = "0.26", optional = true } diff --git a/lib/runtime/examples/Cargo.lock b/lib/runtime/examples/Cargo.lock index 51a79be245..3014ba03ea 100644 --- a/lib/runtime/examples/Cargo.lock +++ b/lib/runtime/examples/Cargo.lock @@ -683,7 +683,7 @@ dependencies = [ [[package]] name = "dynamo-runtime" -version = "0.4.0" +version = "0.4.0+post0" dependencies = [ "anyhow", "arc-swap", @@ -1060,7 +1060,7 @@ checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" [[package]] name = "hello_world" -version = "0.4.0" +version = "0.4.0+post0" dependencies = [ "dynamo-runtime", ] @@ -2548,7 +2548,7 @@ dependencies = [ [[package]] name = "service_metrics" -version = "0.4.0" +version = "0.4.0+post0" dependencies = [ "dynamo-runtime", "futures", @@ -2724,7 +2724,7 @@ dependencies = [ [[package]] name = "system_metrics" -version = "0.4.0" +version = "0.4.0+post0" dependencies = [ "dynamo-runtime", "futures", diff --git a/lib/runtime/examples/Cargo.toml b/lib/runtime/examples/Cargo.toml index 855f203854..cb9a7cb6b3 100644 --- a/lib/runtime/examples/Cargo.toml +++ b/lib/runtime/examples/Cargo.toml @@ -22,7 +22,7 @@ members = [ resolver = "3" [workspace.package] -version = "0.4.0" +version = "0.4.0+post0" edition = "2021" authors = ["NVIDIA"] license = "Apache-2.0" diff --git a/pyproject.toml b/pyproject.toml index 32c6ff5993..2ff19a9ff7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,7 @@ [project] name = "ai-dynamo" -version = "0.4.0" +version = "0.4.0.post0" description = "Distributed Inference Framework" readme = "README.md" authors = [ @@ -25,7 +25,7 @@ license = { text = "Apache-2.0" } license-files = ["LICENSE"] requires-python = ">=3.10" dependencies = [ - "ai-dynamo-runtime==0.4.0", + "ai-dynamo-runtime==0.4.0.post0", "pytest>=8.3.4", "types-psutil>=7.0.0.20250218", "kubernetes>=32.0.1,<33.0.0", @@ -61,18 +61,19 @@ Repository = "https://github.com/ai-dynamo/dynamo.git" [project.optional-dependencies] trtllm =[ "uvloop", - "tensorrt-llm==1.0.0rc4" + "tensorrt-llm==1.0.0rc4", + "triton==3.3.1", # locking triton as version 3.4.0 breaks tensorrt-llm 1.0.0rc4 ] vllm = [ "uvloop", - "nixl", + "nixl<=0.4.1", "vllm==0.10.0", ] sglang = [ "uvloop", - "nixl", + "nixl<=0.4.1", 
"sglang[all]==0.4.9.post6", ]