From 65cc1c20f02cc5e3dfe347d789322f1729b1ca63 Mon Sep 17 00:00:00 2001 From: tanmayv25 Date: Thu, 10 Jul 2025 19:03:22 -0700 Subject: [PATCH 01/22] Refactor the TRTLLM examples remove dynamo SDK --- examples/tensorrt_llm/README.md | 352 ---------------- examples/tensorrt_llm/__init__.py | 14 - examples/tensorrt_llm/common/__init__.py | 0 examples/tensorrt_llm/common/base_engine.py | 383 ------------------ examples/tensorrt_llm/common/parser.py | 62 --- examples/tensorrt_llm/common/protocol.py | 104 ----- examples/tensorrt_llm/components/frontend.py | 119 ------ .../tensorrt_llm/components/prefill_worker.py | 75 ---- examples/tensorrt_llm/components/worker.py | 253 +++++++----- examples/tensorrt_llm/configs/agg.yaml | 33 +- examples/tensorrt_llm/configs/agg_router.yaml | 34 -- .../decode_config.yaml => decode.yaml} | 2 +- .../tensorrt_llm/configs/deepseek_r1/agg.yaml | 35 -- .../configs/deepseek_r1/disagg.yaml | 49 --- .../engine_configs/agg_config.yaml | 54 --- .../engine_configs/decode_config.yaml | 55 --- .../engine_configs/prefill_config.yaml | 37 -- .../mtp/engine_configs/agg_config.yaml | 50 --- .../mtp/engine_configs/decode_config.yaml | 53 --- .../mtp/engine_configs/prefill_config.yaml | 37 -- .../configs/deepseek_r1/mtp/mtp_agg.yaml | 36 -- .../configs/deepseek_r1/mtp/mtp_disagg.yaml | 52 --- .../configs/deepseek_r1/multinode/README.md | 273 ------------- .../multinode/engine_configs/dep16_agg.yaml | 27 -- .../multinode/engine_configs/eplb.yaml | 7 - .../multinode/engine_configs/wide_ep_agg.yaml | 35 -- .../engine_configs/wide_ep_decode.yaml | 59 --- .../engine_configs/wide_ep_prefill.yaml | 41 -- .../deepseek_r1/multinode/srun_aggregated.sh | 75 ---- .../multinode/srun_disaggregated.sh | 94 ----- .../multinode/start_frontend_services.sh | 16 - .../multinode/start_trtllm_worker.sh | 46 --- examples/tensorrt_llm/configs/disagg.yaml | 48 --- .../tensorrt_llm/configs/disagg_router.yaml | 47 --- .../configs/engine_configs/agg_config.yaml | 31 -- .../prefill_config.yaml => prefill.yaml} | 2 +- examples/tensorrt_llm/graphs/agg.py | 19 - examples/tensorrt_llm/graphs/disagg.py | 20 - examples/tensorrt_llm/launch/agg.sh | 25 ++ examples/tensorrt_llm/launch/agg_router.sh | 26 ++ examples/tensorrt_llm/launch/disagg.sh | 34 ++ .../launch/disagg_prefill_first_strategy.sh | 36 ++ examples/tensorrt_llm/launch/disagg_router.sh | 37 ++ .../tensorrt_llm/utils/clear_namespace.py | 44 ++ examples/tensorrt_llm/utils/disagg_utils.py | 64 +++ .../utils/request_handlers/handler_base.py | 145 +++++++ .../utils/request_handlers/handlers.py | 141 +++++++ examples/tensorrt_llm/utils/trtllm_utils.py | 177 ++++++++ 48 files changed, 901 insertions(+), 2557 deletions(-) delete mode 100644 examples/tensorrt_llm/README.md delete mode 100644 examples/tensorrt_llm/__init__.py delete mode 100644 examples/tensorrt_llm/common/__init__.py delete mode 100644 examples/tensorrt_llm/common/base_engine.py delete mode 100644 examples/tensorrt_llm/common/parser.py delete mode 100644 examples/tensorrt_llm/common/protocol.py delete mode 100644 examples/tensorrt_llm/components/frontend.py delete mode 100644 examples/tensorrt_llm/components/prefill_worker.py delete mode 100644 examples/tensorrt_llm/configs/agg_router.yaml rename examples/tensorrt_llm/configs/{engine_configs/decode_config.yaml => decode.yaml} (96%) delete mode 100644 examples/tensorrt_llm/configs/deepseek_r1/agg.yaml delete mode 100644 examples/tensorrt_llm/configs/deepseek_r1/disagg.yaml delete mode 100644 
examples/tensorrt_llm/configs/deepseek_r1/engine_configs/agg_config.yaml delete mode 100644 examples/tensorrt_llm/configs/deepseek_r1/engine_configs/decode_config.yaml delete mode 100644 examples/tensorrt_llm/configs/deepseek_r1/engine_configs/prefill_config.yaml delete mode 100644 examples/tensorrt_llm/configs/deepseek_r1/mtp/engine_configs/agg_config.yaml delete mode 100644 examples/tensorrt_llm/configs/deepseek_r1/mtp/engine_configs/decode_config.yaml delete mode 100644 examples/tensorrt_llm/configs/deepseek_r1/mtp/engine_configs/prefill_config.yaml delete mode 100644 examples/tensorrt_llm/configs/deepseek_r1/mtp/mtp_agg.yaml delete mode 100644 examples/tensorrt_llm/configs/deepseek_r1/mtp/mtp_disagg.yaml delete mode 100644 examples/tensorrt_llm/configs/deepseek_r1/multinode/README.md delete mode 100644 examples/tensorrt_llm/configs/deepseek_r1/multinode/engine_configs/dep16_agg.yaml delete mode 100644 examples/tensorrt_llm/configs/deepseek_r1/multinode/engine_configs/eplb.yaml delete mode 100644 examples/tensorrt_llm/configs/deepseek_r1/multinode/engine_configs/wide_ep_agg.yaml delete mode 100644 examples/tensorrt_llm/configs/deepseek_r1/multinode/engine_configs/wide_ep_decode.yaml delete mode 100644 examples/tensorrt_llm/configs/deepseek_r1/multinode/engine_configs/wide_ep_prefill.yaml delete mode 100755 examples/tensorrt_llm/configs/deepseek_r1/multinode/srun_aggregated.sh delete mode 100755 examples/tensorrt_llm/configs/deepseek_r1/multinode/srun_disaggregated.sh delete mode 100755 examples/tensorrt_llm/configs/deepseek_r1/multinode/start_frontend_services.sh delete mode 100755 examples/tensorrt_llm/configs/deepseek_r1/multinode/start_trtllm_worker.sh delete mode 100644 examples/tensorrt_llm/configs/disagg.yaml delete mode 100644 examples/tensorrt_llm/configs/disagg_router.yaml delete mode 100644 examples/tensorrt_llm/configs/engine_configs/agg_config.yaml rename examples/tensorrt_llm/configs/{engine_configs/prefill_config.yaml => prefill.yaml} (96%) delete mode 100644 examples/tensorrt_llm/graphs/agg.py delete mode 100644 examples/tensorrt_llm/graphs/disagg.py create mode 100755 examples/tensorrt_llm/launch/agg.sh create mode 100755 examples/tensorrt_llm/launch/agg_router.sh create mode 100755 examples/tensorrt_llm/launch/disagg.sh create mode 100755 examples/tensorrt_llm/launch/disagg_prefill_first_strategy.sh create mode 100755 examples/tensorrt_llm/launch/disagg_router.sh create mode 100644 examples/tensorrt_llm/utils/clear_namespace.py create mode 100644 examples/tensorrt_llm/utils/disagg_utils.py create mode 100644 examples/tensorrt_llm/utils/request_handlers/handler_base.py create mode 100644 examples/tensorrt_llm/utils/request_handlers/handlers.py create mode 100644 examples/tensorrt_llm/utils/trtllm_utils.py diff --git a/examples/tensorrt_llm/README.md b/examples/tensorrt_llm/README.md deleted file mode 100644 index f844a56d94..0000000000 --- a/examples/tensorrt_llm/README.md +++ /dev/null @@ -1,352 +0,0 @@ - - -# LLM Deployment Examples using TensorRT-LLM - -This directory contains examples and reference implementations for deploying Large Language Models (LLMs) in various configurations using TensorRT-LLM. 
- -## Use the Latest Release - -We recommend using the latest stable release of dynamo to avoid breaking changes: - -[![GitHub Release](https://img.shields.io/github/v/release/ai-dynamo/dynamo)](https://github.com/ai-dynamo/dynamo/releases/latest) - -You can find the latest release [here](https://github.com/ai-dynamo/dynamo/releases/latest) and check out the corresponding branch with: - -```bash -git checkout $(git describe --tags $(git rev-list --tags --max-count=1)) -``` - -## Deployment Architectures - -See [deployment architectures](../llm/README.md#deployment-architectures) to learn about the general idea of the architecture. -Note that this TensorRT-LLM version does not support all the options yet. - -Note: TensorRT-LLM disaggregation does not support conditional disaggregation yet. You can only configure the deployment to always use aggregate or disaggregated serving. - -## Getting Started - -1. Choose a deployment architecture based on your requirements -2. Configure the components as needed -3. Deploy using the provided scripts - -### Prerequisites - -Start required services (etcd and NATS) using [Docker Compose](../../deploy/metrics/docker-compose.yml) -```bash -docker compose -f deploy/metrics/docker-compose.yml up -d -``` - -### Build docker - -```bash -# TensorRT-LLM uses git-lfs, which needs to be installed in advance. -apt-get update && apt-get -y install git git-lfs - -# On an x86 machine: -./container/build.sh --framework tensorrtllm - -# On an ARM machine: -./container/build.sh --framework tensorrtllm --platform linux/arm64 - -# Build the container with the default experimental TensorRT-LLM commit -# WARNING: This is for experimental feature testing only. -# The container should not be used in a production environment. -./container/build.sh --framework tensorrtllm --use-default-experimental-tensorrtllm-commit -``` - -### Run container - -``` -./container/run.sh --framework tensorrtllm -it -``` -## Run Deployment - -This figure shows an overview of the major components to deploy: - - - -``` - -+------+ +-----------+ +------------------+ +---------------+ -| HTTP |----->| processor |----->| Worker |------------>| Prefill | -| |<-----| |<-----| |<------------| Worker | -+------+ +-----------+ +------------------+ +---------------+ - | ^ | - query best | | return | publish kv events - worker | | worker_id v - | | +------------------+ - | +---------| kv-router | - +------------->| | - +------------------+ - -``` - -Note: The above architecture illustrates all the components. The final components -that get spawned depend upon the chosen graph. - -### Example architectures - -#### Aggregated serving -```bash -cd /workspace/examples/tensorrt_llm -dynamo serve graphs.agg:Frontend -f ./configs/agg.yaml -``` - -#### Aggregated serving with KV Routing -```bash -cd /workspace/examples/tensorrt_llm -dynamo serve graphs.agg:Frontend -f ./configs/agg_router.yaml -``` - -#### Disaggregated serving -```bash -cd /workspace/examples/tensorrt_llm -dynamo serve graphs.disagg:Frontend -f ./configs/disagg.yaml -``` - -#### Disaggregated serving with KV Routing -```bash -cd /workspace/examples/tensorrt_llm -dynamo serve graphs.disagg:Frontend -f ./configs/disagg_router.yaml -``` - -#### Aggregated serving with Multi-Token Prediction (MTP) and DeepSeek R1 -```bash -cd /workspace/examples/tensorrt_llm -dynamo serve graphs.agg:Frontend -f configs/deepseek_r1/mtp/mtp_agg.yaml -``` - -Notes: -- MTP is only available within the container built with the experimental TensorRT-LLM commit. 
Please add --use-default-experimental-tensorrtllm-commit to the arguments of the build.sh script. - - Example: `./container/build.sh --framework tensorrtllm --use-default-experimental-tensorrtllm-commit` - -- There is a noticeable latency for the first two inference requests. Please send warm-up requests before starting the benchmark. -- MTP performance may vary depending on the acceptance rate of predicted tokens, which is dependent on the dataset or queries used while benchmarking. Additionally, `ignore_eos` should generally be omitted or set to `false` when using MTP to avoid speculating garbage outputs and getting unrealistic acceptance rates. - -#### Multi-Node Disaggregated Serving - -In the following example, we will demonstrate how to run a Disaggregated Serving -deployment across multiple nodes. For simplicity, we will demonstrate how to -deploy a single Decode worker on one node, and a single Prefill worker on the other node. -However, the instance counts, TP sizes, other configs, and responsibilities of each node -can be customized and deployed in similar ways. - -For example, to deploy Deepseek R1, you could replace the referenced example -configs (`configs/agg.yaml`, `configs/disagg.yaml`) with corresponding Deepseek R1 -example configs (`configs/deepseek_r1/agg.yaml`, `configs/deepseek_r1/disagg.yaml`). -You can find the example Deepseek R1 configs for GB200 -[here](configs/deepseek_r1), but the config settings can be customized for testing -other hardware configurations or parallelism strategies. - -This "multi-node" example demonstrates how to generally connect dynamo workers from -different nodes, but for simplicity, each worker individually fits on a single node. -For details on how to launch a worker that spans multiple nodes due to sheer model -size, or for features like large scale expert parallelism, see the -[multinode worker example](configs/deepseek_r1/multinode). - -##### Head Node - -Start nats/etcd: -```bash -# NATS data persisted to /tmp/nats/jetstream by default -nats-server -js & - -# Persist data to /tmp/etcd, otherwise defaults to ${PWD}/default.etcd if left unspecified -etcd --listen-client-urls http://0.0.0.0:2379 --advertise-client-urls http://0.0.0.0:2379 --data-dir /tmp/etcd & - -# NOTE: Clearing out the etcd and nats jetstream data directories across runs -# helps to guarantee a clean and reproducible results. -``` - -Launch graph of Frontend and TensorRTLLMWorker (decode) on head node: - -```bash -cd /workspace/examples/tensorrt_llm -dynamo serve graphs.agg:Frontend -f ./configs/disagg.yaml & -``` - -Notes: -- The aggregated graph (`graphs.agg`) is chosen here because it also describes - our desired deployment settings for the head node: launching the utility components - (Frontend, Processor), and only the decode worker (TensorRTLLMWorker configured with - `remote-prefill` enabled). We plan to launch the `TensorRTLLMPrefillWorker` - independently on a separate node in the next step of this demonstration. - You are free to customize the graph and configuration of components launched on - each node. -- The disaggregated config `configs/disagg.yaml` is intentionally chosen here as a - single source of truth to be used for deployments on all of our nodes, describing - the configurations for all of our components, including both decode and prefill - workers, but can be customized based on your deployment needs. 
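Before moving on to the worker nodes, it can help to confirm that the etcd and NATS endpoints started above are actually reachable from each worker node. The following stdlib-only Python sketch is not part of the example code; it simply probes the two ports used throughout this guide (2379 for etcd, 4222 for NATS), assuming `HEAD_NODE_IP` is exported as shown in the next section.

```python
# Hypothetical reachability probe for the head-node services (not part of the repo).
import os
import socket

head = os.environ.get("HEAD_NODE_IP", "127.0.0.1")
for name, port in [("etcd", 2379), ("NATS", 4222)]:
    try:
        with socket.create_connection((head, port), timeout=3):
            print(f"{name} reachable at {head}:{port}")
    except OSError as exc:
        print(f"{name} NOT reachable at {head}:{port}: {exc}")
```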
- -##### Worker Node(s) - -Set environment variables pointing at the etcd/nats endpoints on the head node -so the Dynamo Distributed Runtime can orchestrate communication and -discoverability between the head node and worker nodes: -```bash -# if not head node -export HEAD_NODE_IP="" -export NATS_SERVER="nats://${HEAD_NODE_IP}:4222" -export ETCD_ENDPOINTS="${HEAD_NODE_IP}:2379" -``` - -Deploy a Prefill worker: -```bash -cd /workspace/examples/tensorrt_llm -dynamo serve components.prefill_worker:TensorRTLLMPrefillWorker -f ./configs/disagg.yaml --service-name TensorRTLLMPrefillWorker & -``` - -Now you have a 2-node deployment with 1 Decode worker on the head node, and 1 Prefill worker on a worker node! - -##### Additional Notes for Multi-Node Deployments - -Notes: -- To include a router in this deployment, change the graph to one that includes the router, such as `graphs.agg_router`, - and change the config to one that includes the router, such as `configs/disagg_router.yaml` -- This step assumes you're using disaggregated serving and planning to launch prefill workers on separate nodes. - However, for an aggregated deployment with additional aggregated worker replicas on other nodes, this step - remains mostly the same. The primary difference between aggregation and disaggregation for this step is - whether or not the `TensorRTLLMWorker` is configured to do `remote-prefill` in the config file - (ex: `configs/disagg.yaml` vs `configs/agg.yaml`). -- To apply the same concept for launching additional decode workers on worker nodes, you can - directly start them, similar to the prefill worker step above: - ```bash - # Example: deploy decode worker only - cd /workspace/examples/tensorrt_llm - dynamo serve components.worker:TensorRTLLMWorker -f ./configs/disagg.yaml --service-name TensorRTLLMWorker & - ``` -- If you see an error about MPI Spawn failing during TRTLLM Worker initialization on a Slurm-based cluster, - try unsetting the following environment variables before launching the TRTLLM worker. If you intend to - run other slurm-based commands or processes on the same node after deploying the TRTLLM worker, you may - want to save these values into temporary variables and then restore them afterwards. - ```bash - # Workaround for error: `mpi4py.MPI.Exception: MPI_ERR_SPAWN: could not spawn processes` - unset SLURM_JOBID SLURM_JOB_ID SLURM_NODELIST - ``` - -#### Multi-Node Disaggregated Serving with Multi-Token Prediction (MTP) and DeepSeek R1 - -Most of the steps remain the same as the above example, but this time we will have `dynamo serve` point to different config files that contain the MTP configurations. - -##### Head Node - -Start nats/etcd: -```bash -nats-server -js & -etcd --listen-client-urls http://0.0.0.0:2379 --advertise-client-urls http://0.0.0.0:2379 --data-dir /tmp/etcd & -``` - -Launch graph of Frontend and TensorRTLLMWorker (decode) on head node: - -```bash -cd /workspace/examples/tensorrt_llm -dynamo serve graphs.agg:Frontend -f configs/deepseek_r1/mtp/mtp_disagg.yaml & -``` - -##### Worker Node(s) - -Set environment variables pointing at the etcd/nats endpoints on the head node.
-```bash -export HEAD_NODE_IP="" -export NATS_SERVER="nats://${HEAD_NODE_IP}:4222" -export ETCD_ENDPOINTS="${HEAD_NODE_IP}:2379" -``` - -Deploy a Prefill worker: -```bash -cd /workspace/examples/tensorrt_llm -dynamo serve components.prefill_worker:TensorRTLLMPrefillWorker -f configs/deepseek_r1/mtp/mtp_disagg.yaml --service-name TensorRTLLMPrefillWorker & -``` - -Notes: -- MTP is only available within the container built with the experimental TensorRT-LLM commit. Please add --use-default-experimental-tensorrtllm-commit to the arguments of the build.sh script. - - Example: `./container/build.sh --framework tensorrtllm --use-default-experimental-tensorrtllm-commit` -- There is a noticeable latency for the first two inference requests. Please send warm-up requests before starting the benchmark. -- MTP performance may vary depending on the acceptance rate of predicted tokens, which is dependent on the dataset or queries used while benchmarking. Additionally, `ignore_eos` should generally be omitted or set to `false` when using MTP to avoid speculating garbage outputs and getting unrealistic acceptance rates. - - -### Client - -See [client](../llm/README.md#client) section to learn how to send request to the deployment. - -NOTE: To send a request to a multi-node deployment, target the node which deployed the `Frontend` component. - -### Close deployment - -See [close deployment](../../docs/guides/dynamo_serve.md#close-deployment) section to learn about how to close the deployment. - -### Benchmarking - -To benchmark your deployment with GenAI-Perf, see this utility script, configuring the -`model` name and `host` based on your deployment: [perf.sh](../../benchmarks/llm/perf.sh) - - -### KV Cache Transfer for Disaggregated Serving - -In disaggregated serving architectures, KV cache must be transferred between prefill and decode nodes. TensorRT-LLM supports two methods for this transfer: - -#### Default Method: UCX -By default, TensorRT-LLM uses UCX (Unified Communication X) for KV cache transfer between prefill and decode nodes. UCX provides high-performance communication optimized for GPU-to-GPU transfers. - -#### Experimental Method: NIXL -TensorRT-LLM also provides experimental support for using **NIXL** (NVIDIA Inference Xfer Library) for KV cache transfer. [NIXL](https://github.com/ai-dynamo/nixl) is NVIDIA's high-performance communication library designed for efficient data transfer in distributed GPU environments. - -**Note:** NIXL support in TensorRT-LLM is experimental and is not suitable for production environments yet. - -#### Using NIXL for KV Cache Transfer - -**Note:** NIXL backend for TensorRT-LLM is currently only supported on AMD64 (x86_64) architecture. If you're running on ARM64, you'll need to use the default UCX method for KV cache transfer. - -To enable NIXL for KV cache transfer in disaggregated serving: - -1. **Build the container with NIXL support:** - The TensorRT-LLM wheel must be built from source with NIXL support. The `./container/build.sh` script caches previously built TensorRT-LLM wheels to reduce build time. If you have previously built a TensorRT-LLM wheel without NIXL support, you must delete the cached wheel to force a rebuild with NIXL support. 
- - **Remove cached TensorRT-LLM wheel (only if previously built without NIXL support):** - ```bash - rm -rf /tmp/trtllm_wheel - ``` - - **Build the container with NIXL support:** - ```bash - ./container/build.sh --framework tensorrtllm \ - --use-default-experimental-tensorrtllm-commit \ - --trtllm-use-nixl-kvcache-experimental - ``` - - **Note:** Both `--use-default-experimental-tensorrtllm-commit` and `--trtllm-use-nixl-kvcache-experimental` flags are required to enable NIXL support. - -2. **Run the containerized environment:** - See [run container](#run-container) section to learn how to start the container image built in previous step. - -3. **Start the disaggregated service:** - See [disaggregated serving](#disaggregated-serving) to see how to start the deployment. - -4. **Send the request:** - See [client](#client) section to learn how to send the request to deployment. - -**Important:** Ensure that ETCD and NATS services are running before starting the service. - -The container will automatically configure the appropriate environment variables (`TRTLLM_USE_NIXL_KVCACHE=1`) when built with the NIXL flag. The same container image can be used to use UCX for KV cache transfer. -```bash -unset TRTLLM_USE_NIXL_KVCACHE -export TRTLLM_USE_UCX_KVCACHE=1 -``` - diff --git a/examples/tensorrt_llm/__init__.py b/examples/tensorrt_llm/__init__.py deleted file mode 100644 index 3159bfe656..0000000000 --- a/examples/tensorrt_llm/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/examples/tensorrt_llm/common/__init__.py b/examples/tensorrt_llm/common/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/examples/tensorrt_llm/common/base_engine.py b/examples/tensorrt_llm/common/base_engine.py deleted file mode 100644 index 4bd7a60d40..0000000000 --- a/examples/tensorrt_llm/common/base_engine.py +++ /dev/null @@ -1,383 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import logging -from dataclasses import dataclass -from typing import Any, Optional - -from common.protocol import DisaggregatedTypeConverter, TRTLLMWorkerRequest -from tensorrt_llm import SamplingParams -from tensorrt_llm.llmapi.llm_utils import update_llm_args_with_extra_options -from tensorrt_llm.llmapi.tokenizer import tokenizer_factory -from tensorrt_llm.serve.openai_protocol import ( - DisaggregatedParams as OAIDisaggregatedParams, -) - -from dynamo.llm import get_tensorrtllm_engine, get_tensorrtllm_publisher -from dynamo.runtime import DistributedRuntime - -logger = logging.getLogger(__name__) - -logger.setLevel(logging.DEBUG) - -# Default buffer size for kv cache events. -DEFAULT_KV_EVENT_BUFFER_MAX_SIZE = 1024 - - -def parse_endpoint(endpoint: str) -> tuple[str, str, str]: - endpoint_str = endpoint.replace("dyn://", "", 1) - endpoint_parts = endpoint_str.split(".") - if len(endpoint_parts) != 3: - raise ValueError( - f"Invalid endpoint format: '{endpoint}'. " - "Expected 'dyn://namespace.component.endpoint' or 'namespace.component.endpoint'." - ) - - return (endpoint_parts[0], endpoint_parts[1], endpoint_parts[2]) - - -@dataclass -class BaseEngineConfig: - """Base engine configuration""" - - namespace: str - component: str - endpoint: str - model_path: str - served_model_name: Optional[str] = None - kv_block_size: int = 32 - extra_engine_args: str = "" - publish_events_and_metrics: bool = False - disaggregation_mode: str = "prefill_and_decode" - remote_prefill_endpoint: Optional[str] = None - lease_id: int = 0 - - def __str__(self) -> str: - return ( - f"Config(namespace={self.namespace}, " - f"component={self.component}, " - f"endpoint={self.endpoint}, " - f"model_path={self.model_path}, " - f"served_model_name={self.served_model_name}, " - f"kv_block_size={self.kv_block_size}, " - f"extra_engine_args={self.extra_engine_args}, " - f"publish_events_and_metrics={self.publish_events_and_metrics}, " - f"disaggregation_mode={self.disaggregation_mode}, " - f"remote_prefill_endpoint={self.remote_prefill_endpoint}, " - f"lease_id={self.lease_id})" - ) - - -class BaseTensorrtLLMEngine: - def __init__( - self, - config: BaseEngineConfig, - ): - self._config = config - self._prefill_client = None - self._llm_engine = None - self._llm_engine_context = None - self._llm_publisher = None - self._llm_publisher_context = None - self._runtime = None - self._first_generation = True - # Initialize default sampling params - self.default_sampling_params = SamplingParams() - - async def initialize(self, runtime: DistributedRuntime): - """Initialize the engine and prefill client if needed""" - self._runtime = runtime - - # Convert model path to Path object if it's a local path, otherwise keep as string - model_path = str(self._config.model_path) - - # Initialize the LLM engine - engine_args: dict[str, Any] = { - "model": model_path, - "tensor_parallel_size": 1, - "backend": "pytorch", - "skip_tokenizer_init": True, - } - - if self._config.extra_engine_args: - # TODO: Support extra engine args from json file as well. - engine_args = update_llm_args_with_extra_options( - engine_args, self._config.extra_engine_args - ) - # Update the model path in the config to the model path used by the engine. - self._config.model_path = str(engine_args["model"]) - if not self._config.model_path: - raise ValueError( - "Model specification is required. Present neither in the config nor in the extra engine args." 
- ) - - # Populate default sampling params from the model - tokenizer = tokenizer_factory(self._config.model_path) - self.default_sampling_params = SamplingParams() - self.default_sampling_params._setup(tokenizer) - self.default_sampling_params.stop = None - - if self._config.publish_events_and_metrics: - # 'event_buffer_max_size' is required to enable TRTLLM to publish kv cache events. - kv_cache_config: dict[str, Any] | Any = None - if "kv_cache_config" not in engine_args: - kv_cache_config = {} - kv_cache_config[ - "event_buffer_max_size" - ] = DEFAULT_KV_EVENT_BUFFER_MAX_SIZE - else: - kv_cache_config = engine_args["kv_cache_config"] - if ( - hasattr(kv_cache_config, "event_buffer_max_size") - and not kv_cache_config.event_buffer_max_size - ): - kv_cache_config.event_buffer_max_size = ( - DEFAULT_KV_EVENT_BUFFER_MAX_SIZE - ) - elif ( - isinstance(kv_cache_config, dict) - and "event_buffer_max_size" not in kv_cache_config - ): - kv_cache_config[ - "event_buffer_max_size" - ] = DEFAULT_KV_EVENT_BUFFER_MAX_SIZE - engine_args["kv_cache_config"] = kv_cache_config - - # Enable iter perf stats by default if we are publishing events and metrics. - if not engine_args.get("enable_iter_perf_stats"): - engine_args["enable_iter_perf_stats"] = True - - # Only pytorch backend is supported for now to publish events and metrics. - if engine_args.get("backend") != "pytorch": - logging.error( - "Only pytorch backend is supported for now to publish events and metrics." - ) - raise RuntimeError( - "Only pytorch backend is supported for now to publish events and metrics. Hence, KV router is not supported." - ) - - logging.info(f"TRTLLM engine args: {engine_args}") - - # Get the engine using the asynccontextmanager - self._llm_engine_context = get_tensorrtllm_engine(engine_args) - if self._llm_engine_context is not None: - self._llm_engine = await self._llm_engine_context.__aenter__() - else: - raise RuntimeError("Failed to create LLM engine context") - - if ( - self._config.publish_events_and_metrics - and self._config.disaggregation_mode != "prefill" - ): - kv_listener = runtime.namespace(self._config.namespace).component( - self._config.component - ) - self._llm_publisher_context = get_tensorrtllm_publisher( - kv_listener, - self._llm_engine, - kv_listener, - self._config.lease_id, - self._config.kv_block_size, - ) - if self._llm_publisher_context is not None: - self._llm_publisher = await self._llm_publisher_context.__aenter__() - else: - raise RuntimeError("Failed to create LLM publisher context") - - # Initialize prefill client if in decode mode - if self._config.disaggregation_mode == "decode": - if self._config.remote_prefill_endpoint is None: - raise ValueError("remote_prefill_endpoint is required for decode mode") - logging.info( - f"Initializing remote prefill client for endpoint: {self._config.remote_prefill_endpoint}" - ) - ( - parsed_namespace, - parsed_component_name, - parsed_endpoint_name, - ) = parse_endpoint(self._config.remote_prefill_endpoint) - if self._runtime is not None: - self._prefill_client = ( - await self._runtime.namespace(parsed_namespace) - .component(parsed_component_name) - .endpoint(parsed_endpoint_name) - .client() - ) - else: - raise RuntimeError("Runtime not initialized") - - async def cleanup(self): - """Cleanup resources""" - if self._llm_publisher_context: - try: - await self._llm_publisher_context.__aexit__(None, None, None) - except Exception as e: - logging.error(f"Error during publisher cleanup: {e}") - finally: - self._llm_publisher = None - 
self._llm_publisher_context = None - - if self._llm_engine_context: - try: - await self._llm_engine_context.__aexit__(None, None, None) - except Exception as e: - logging.error(f"Error during engine cleanup: {e}") - finally: - self._llm_engine = None - self._llm_engine_context = None - - self._prefill_client = None - - async def remote_prefill(self, request: TRTLLMWorkerRequest): - """ - Send a prefill request to the remote prefill worker. - - Args: - request: The original request to be sent for prefill - - Returns: - The response from the remote prefill worker - - Raises: - ValueError: If prefill client is not initialized or multiple responses received - """ - prefill_request = request.model_copy(deep=True) - # TRTLLM requires max_tokens to be set for prefill requests. - prefill_request.stop_conditions.max_tokens = 1 - prefill_request.disaggregated_params = OAIDisaggregatedParams( - request_type="context_only" - ) - - if self._prefill_client is None: - raise ValueError("Prefill client not initialized") - try: - # TODO: Use smart KV router to determine which prefill worker to use. This would also require supporting publishing events for prefill workers. - remote_prefill_responses = [ - remote_prefill_response - async for remote_prefill_response in await self._prefill_client.round_robin( - prefill_request.model_dump_json() - ) - ] - except Exception as e: - raise ValueError(f"Error in remote prefill: {e}") - - if len(remote_prefill_responses) > 1: - raise ValueError( - "Prefill worker returned more than one response. This is currently not supported in remote prefill mode." - ) - - if len(remote_prefill_responses) == 0: - raise ValueError("No response received from remote prefill worker") - - remote_prefill_response = remote_prefill_responses[0] - return remote_prefill_response - - async def generate(self, request: TRTLLMWorkerRequest): - if self._llm_engine is None: - raise RuntimeError("Engine not initialized") - - if self._llm_publisher: - publishers_error = self._llm_publisher.check_error_queue() - if publishers_error: - raise publishers_error - - inputs = request.token_ids - - # Decode the disaggregated params from the request - disaggregated_params = DisaggregatedTypeConverter.to_llm_disaggregated_params( - request.disaggregated_params - ) - num_output_tokens_so_far = 0 - - if self._config.disaggregation_mode == "decode": - # Run prefill/context phase remotely if disaggregation mode is decode. 
- try: - prefill_result = await self.remote_prefill(request) - except Exception as e: - raise ValueError(f"Error in remote prefill: {e}") - - remote_prefill_response = prefill_result.data() - if ( - remote_prefill_response["finish_reason"] == "stop" - or remote_prefill_response["finish_reason"] == "error" - ): - yield remote_prefill_response - return - - # Decode the disaggregated params from the remote prefill response - # Decode the disaggregated params from the remote prefill response - disaggregated_params = ( - DisaggregatedTypeConverter.to_llm_disaggregated_params( - OAIDisaggregatedParams( - **remote_prefill_response["disaggregated_params"] - ) - ) - ) - - # Set the disaggregated params to generation_only for the rest of the generation - disaggregated_params.request_type = "generation_only" - - sampling_params = self.default_sampling_params - for key, value in request.sampling_options.model_dump().items(): - if not value: - continue - if hasattr(sampling_params, key): - setattr(sampling_params, key, value) - - max_tokens = request.stop_conditions.max_tokens - if max_tokens: - sampling_params.max_tokens = max_tokens - - ignore_eos = request.stop_conditions.ignore_eos - if ignore_eos: - sampling_params.ignore_eos = ignore_eos - - # TODO: Disable streaming for context only requests when adding disagg support - async for res in self._llm_engine.llm.generate_async( - inputs=inputs, - sampling_params=sampling_params, - disaggregated_params=disaggregated_params, - streaming=(self._config.disaggregation_mode != "prefill"), - ): - # TRTLLM engine needs to start generating tokens first before stats - # can be retrieved. - if self._first_generation and self._llm_publisher: - self._llm_publisher.start() - self._first_generation = False - - if res.finished and self._config.disaggregation_mode != "prefill": - yield {"finish_reason": "stop", "token_ids": []} - break - - if not res.outputs: - yield {"finish_reason": "error", "token_ids": []} - break - - output = res.outputs[0] - next_total_toks = len(output.token_ids) - out = {"token_ids": output.token_ids[num_output_tokens_so_far:]} - if output.finish_reason: - out["finish_reason"] = output.finish_reason - if output.stop_reason: - out["stop_reason"] = output.stop_reason - if self._config.disaggregation_mode == "prefill": - # Return the disaggregated params only when operating in prefill mode. - out[ - "disaggregated_params" - ] = DisaggregatedTypeConverter.to_oai_disaggregated_params( - output.disaggregated_params - ).model_dump() - - yield out - num_output_tokens_so_far = next_total_toks diff --git a/examples/tensorrt_llm/common/parser.py b/examples/tensorrt_llm/common/parser.py deleted file mode 100644 index 67bb230796..0000000000 --- a/examples/tensorrt_llm/common/parser.py +++ /dev/null @@ -1,62 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
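As a point of reference only, the `generate()` method in the deleted `base_engine.py` above streams incremental results: each yielded dict carries just the `token_ids` produced since the previous chunk, plus an optional `finish_reason`/`stop_reason` (and, in prefill mode, the disaggregated params). A hypothetical caller, not part of the example code, could reassemble a completion like this, assuming `engine` is an initialized `BaseTensorrtLLMEngine` and `request` a `TRTLLMWorkerRequest`:

```python
# Sketch of a consumer for the streamed worker output (hypothetical helper).
async def collect_completion(engine, request):
    token_ids, finish_reason = [], None
    async for chunk in engine.generate(request):
        # Each chunk holds only the newly generated token ids.
        token_ids.extend(chunk.get("token_ids", []))
        if chunk.get("finish_reason") is not None:
            finish_reason = chunk["finish_reason"]
            break
    return token_ids, finish_reason
```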
- -import argparse - - -def parse_tensorrt_llm_args( - config_args, -) -> argparse.Namespace: - parser = argparse.ArgumentParser(description="A TensorRT-LLM Worker parser") - parser.add_argument( - "--extra-engine-args", - type=str, - default="", - help="Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine.", - ) - parser.add_argument( - "--model-path", - type=str, - default=None, - help="Path to disk model or HuggingFace model identifier to load.", - ) - parser.add_argument( - "--served_model_name", - type=str, - help="Name to serve the model under.", - ) - parser.add_argument( - "--router", - type=str, - choices=["random", "round-robin", "kv"], - default="random", - help="Router type to use for scheduling requests to workers", - ) - - parser.add_argument( - "--kv-block-size", - type=int, - default=32, - help="Number of tokens per KV block in TRTLLM worker. Default is 32 for pytorch backend.", - ) - - parser.add_argument( - "--enable-disagg", - action="store_true", - help="Enable remote prefill for the worker", - ) - - args = parser.parse_args(config_args) - return args diff --git a/examples/tensorrt_llm/common/protocol.py b/examples/tensorrt_llm/common/protocol.py deleted file mode 100644 index f05cdb9f8f..0000000000 --- a/examples/tensorrt_llm/common/protocol.py +++ /dev/null @@ -1,104 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
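For reference, the parser removed above was driven by the `ServiceConfig` entries from the YAML files (e.g. `model-path` and `router` under `TensorRTLLMWorker:` in `configs/agg.yaml`), which `ServiceConfig.as_args()` flattened into CLI-style flags. A minimal, hypothetical invocation (not part of the example code, and depending on the now-removed `common.parser` module) would look like:

```python
# Assumes the removed common.parser module from this example package is importable.
from common.parser import parse_tensorrt_llm_args

args = parse_tensorrt_llm_args(
    [
        "--model-path", "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
        "--served_model_name", "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
        "--router", "kv",
        "--kv-block-size", "32",
        "--enable-disagg",
    ]
)
assert args.router == "kv" and args.enable_disagg
```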
-import base64 -from typing import List, Optional - -from pydantic import BaseModel, Field -from tensorrt_llm.llmapi import DisaggregatedParams as LlmDisaggregatedParams -from tensorrt_llm.serve.openai_protocol import DisaggregatedParams - - -class Tokens(BaseModel): - tokens: list[int] - - -TokenIdType = int - - -class DisaggregatedTypeConverter: - @staticmethod - def to_llm_disaggregated_params( - disaggregated_params: DisaggregatedParams, - ) -> LlmDisaggregatedParams: - if disaggregated_params is None: - return None - else: - opaque_state = ( - base64.b64decode(disaggregated_params.encoded_opaque_state) - if disaggregated_params.encoded_opaque_state is not None - else None - ) - - return LlmDisaggregatedParams( - request_type=disaggregated_params.request_type, - first_gen_tokens=disaggregated_params.first_gen_tokens, - ctx_request_id=disaggregated_params.ctx_request_id, - opaque_state=opaque_state, - ) - - @staticmethod - def to_oai_disaggregated_params( - tllm_disagg_params: LlmDisaggregatedParams, - ) -> DisaggregatedParams: - if tllm_disagg_params is None: - return None - else: - encoded_opaque_state = ( - base64.b64encode(tllm_disagg_params.opaque_state).decode("utf-8") - if tllm_disagg_params.opaque_state is not None - else None - ) - return DisaggregatedParams( - request_type=tllm_disagg_params.request_type, - first_gen_tokens=tllm_disagg_params.first_gen_tokens, - ctx_request_id=tllm_disagg_params.ctx_request_id, - encoded_opaque_state=encoded_opaque_state, - ) - - -# TODO: move these to common for all LLMs once we adopt dynamo-run -# derived from lib/llm/src/protocols/common/preprocessor.rs -class StopConditions(BaseModel): - max_tokens: Optional[int] = None - stop: Optional[List[str]] = None - stop_token_ids_hidden: Optional[List[TokenIdType]] = None - min_tokens: Optional[int] = None - ignore_eos: Optional[bool] = None - - -class SamplingOptions(BaseModel): - n: Optional[int] = None - best_of: Optional[int] = None - presence_penalty: Optional[float] = None - frequency_penalty: Optional[float] = None - repetition_penalty: Optional[float] = None - temperature: Optional[float] = None - top_p: Optional[float] = None - top_k: Optional[int] = None - min_p: Optional[float] = None - use_beam_search: Optional[bool] = None - length_penalty: Optional[float] = None - seed: Optional[int] = None - - -class TRTLLMWorkerRequest(BaseModel): - token_ids: List[TokenIdType] - stop_conditions: StopConditions - sampling_options: SamplingOptions - eos_token_ids: List[TokenIdType] = Field(default_factory=list) - mdc_sum: Optional[str] = None - annotations: List[str] = Field(default_factory=list) - estimated_prefix_hit_num_blocks: Optional[int] = None - disaggregated_params: Optional[DisaggregatedParams] = Field(default=None) diff --git a/examples/tensorrt_llm/components/frontend.py b/examples/tensorrt_llm/components/frontend.py deleted file mode 100644 index 98be2dfa33..0000000000 --- a/examples/tensorrt_llm/components/frontend.py +++ /dev/null @@ -1,119 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import subprocess -from pathlib import Path - -from components.worker import TensorRTLLMWorker -from fastapi import FastAPI -from pydantic import BaseModel - -from dynamo import sdk -from dynamo.sdk import depends, service -from dynamo.sdk.lib.config import ServiceConfig -from dynamo.sdk.lib.image import DYNAMO_IMAGE - -logger = logging.getLogger(__name__) - - -def get_dynamo_run_binary(): - """Find the dynamo-run binary path in SDK or fallback to 'dynamo-run' command.""" - sdk_path = Path(sdk.__file__) - binary_path = sdk_path.parent / "cli/bin/dynamo-run" - if not binary_path.exists(): - return "dynamo-run" - else: - return str(binary_path) - - -class FrontendConfig(BaseModel): - """Configuration for the Frontend service including model and HTTP server settings.""" - - served_model_name: str - endpoint: str - port: int = 8000 - router: str = "round-robin" - block_size: int = 32 - - -# todo this should be called ApiServer -@service( - dynamo={ - "namespace": "dynamo", - }, - workers=1, - image=DYNAMO_IMAGE, - app=FastAPI(title="TensorRT-LLM Example"), -) -class Frontend: - worker = depends(TensorRTLLMWorker) - - def __init__(self): - """Initialize Frontend service with HTTP server and model configuration.""" - self.frontend_config = FrontendConfig( - **ServiceConfig.get_parsed_config("Frontend") - ) - self.process = None - - logger.warning(f"Frontend config: {self.frontend_config}") - - self.start_ingress_and_processor() - - def start_ingress_and_processor(self): - """Starting dynamo-run based ingress and processor""" - logger.info( - f"Starting HTTP server and processor on port {self.frontend_config.port}" - ) - dynamo_run_binary = get_dynamo_run_binary() - - cmd = [ - dynamo_run_binary, - "in=http", - "out=dyn", - "--http-port", - str(self.frontend_config.port), - "--router-mode", - self.frontend_config.router, - ] - - logger.info(f"Frontend cmd: {cmd}") - - self.process = subprocess.Popen( - cmd, - stdout=None, - stderr=None, - ) - - def close(self): - """Clean up resources by terminating the subprocess.""" - if self.process is not None: - try: - logger.info("Terminating subprocess...") - self.process.terminate() - # Wait for process to terminate with a timeout - self.process.wait(timeout=5) - except subprocess.TimeoutExpired: - logger.warning("Subprocess did not terminate gracefully, forcing kill") - self.process.kill() - self.process.wait() - except Exception as e: - logger.error(f"Error while terminating subprocess: {e}") - finally: - self.process = None - - def __del__(self): - """Destructor to ensure subprocess is cleaned up.""" - self.close() diff --git a/examples/tensorrt_llm/components/prefill_worker.py b/examples/tensorrt_llm/components/prefill_worker.py deleted file mode 100644 index 7e43d1fca7..0000000000 --- a/examples/tensorrt_llm/components/prefill_worker.py +++ /dev/null @@ -1,75 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import logging - -from common.base_engine import BaseEngineConfig, BaseTensorrtLLMEngine -from common.parser import parse_tensorrt_llm_args -from common.protocol import TRTLLMWorkerRequest - -from dynamo.sdk import async_on_start, dynamo_context, endpoint, on_shutdown, service -from dynamo.sdk.lib.config import ServiceConfig - -logger = logging.getLogger(__name__) - - -@service( - dynamo={ - "namespace": "dynamo", - }, - resources={"gpu": 1, "cpu": "10", "memory": "20Gi"}, - workers=1, -) -class TensorRTLLMPrefillWorker(BaseTensorrtLLMEngine): - def __init__(self): - logger.info("Initializing TensorRT-LLM Prefill Worker") - class_name = self.__class__.__name__ - config = ServiceConfig.get_instance() - config_args = config.as_args(class_name, prefix="") - args = parse_tensorrt_llm_args(config_args) - lease_id = dynamo_context["endpoints"][0].lease_id() - namespace, _ = TensorRTLLMPrefillWorker.dynamo_address() # type: ignore - - engine_config = BaseEngineConfig( - namespace=namespace, - component=class_name, - endpoint="generate", - model_path=args.model_path, - served_model_name=args.served_model_name, - kv_block_size=args.kv_block_size, - extra_engine_args=args.extra_engine_args, - publish_events_and_metrics=False, - disaggregation_mode="prefill", - remote_prefill_endpoint=None, - lease_id=lease_id, - ) - - super().__init__(config=engine_config) - - @async_on_start - async def async_init(self): - runtime = dynamo_context["runtime"] - await self.initialize(runtime) - logger.info("TensorRT-LLM Prefill Worker initialized") - - @on_shutdown - async def async_cleanup(self): - logger.info("Cleaning up TensorRT-LLM Prefill Worker") - await self.cleanup() - logger.info("TensorRT-LLM Prefill Worker cleanup completed") - - @endpoint() - async def generate(self, request: TRTLLMWorkerRequest): - async for response in super().generate(request): - yield response diff --git a/examples/tensorrt_llm/components/worker.py b/examples/tensorrt_llm/components/worker.py index 9074bfbe8d..c2ac35d528 100644 --- a/examples/tensorrt_llm/components/worker.py +++ b/examples/tensorrt_llm/components/worker.py @@ -1,115 +1,172 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
+ +import asyncio import logging +import os +import signal +import sys + +# Add the parent directory to the Python path so we can import utils +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) -from common.base_engine import BaseEngineConfig, BaseTensorrtLLMEngine -from common.parser import parse_tensorrt_llm_args -from common.protocol import TRTLLMWorkerRequest -from components.prefill_worker import TensorRTLLMPrefillWorker - -from dynamo.llm import ModelType, register_llm -from dynamo.sdk import ( - async_on_start, - depends, - dynamo_context, - endpoint, - on_shutdown, - service, +import uvloop +from tensorrt_llm import SamplingParams +from tensorrt_llm.llmapi.llm_utils import update_llm_args_with_extra_options +from tensorrt_llm.llmapi.tokenizer import tokenizer_factory +from utils.request_handlers.handlers import RequestHandlerConfig, RequestHandlerFactory +from utils.trtllm_utils import Config, cmd_line_args, is_first_worker, parse_endpoint + +from dynamo.llm import ( + ModelType, + get_tensorrtllm_engine, + get_tensorrtllm_publisher, + register_llm, ) -from dynamo.sdk.lib.config import ServiceConfig +from dynamo.runtime import DistributedRuntime, dynamo_worker +from dynamo.runtime.logging import configure_dynamo_logging -logger = logging.getLogger(__name__) +# Default buffer size for kv cache events. +DEFAULT_KV_EVENT_BUFFER_MAX_SIZE = 1024 +configure_dynamo_logging() + + +async def graceful_shutdown(runtime): + logging.info("Received shutdown signal, shutting down DistributedRuntime") + runtime.shutdown() + logging.info("DistributedRuntime shutdown complete") -@service( - dynamo={ - "namespace": "dynamo", - }, - resources={"gpu": 1, "cpu": "10", "memory": "20Gi"}, - workers=1, -) -class TensorRTLLMWorker(BaseTensorrtLLMEngine): - prefill_worker = depends(TensorRTLLMPrefillWorker) - - def __init__(self): - logger.info("Initializing TensorRT-LLM Worker") - class_name = self.__class__.__name__ - config = ServiceConfig.get_instance() - config_args = config.as_args(class_name, prefix="") - args = parse_tensorrt_llm_args(config_args) - lease_id = dynamo_context["endpoints"][0].lease_id() - namespace, _ = TensorRTLLMWorker.dynamo_address() # type: ignore - endpoint_name = "generate" - publish_events_and_metrics = args.router == "kv" - prefill_class_name = "TensorRTLLMPrefillWorker" - - if args.enable_disagg: - disaggregation_mode = "decode" - else: - disaggregation_mode = "prefill_and_decode" - - engine_config = BaseEngineConfig( - namespace=namespace, - component=class_name, - endpoint=endpoint_name, - model_path=args.model_path, - served_model_name=args.served_model_name, - kv_block_size=args.kv_block_size, - extra_engine_args=args.extra_engine_args, - publish_events_and_metrics=publish_events_and_metrics, - disaggregation_mode=disaggregation_mode, - remote_prefill_endpoint=f"dyn://{namespace}.{prefill_class_name}.generate", - lease_id=lease_id, - ) - super().__init__(config=engine_config) +@dynamo_worker(static=False) +async def worker(runtime: DistributedRuntime): + # Set up signal handler for graceful shutdown + loop = asyncio.get_running_loop() - @async_on_start - async def async_init(self): - runtime = dynamo_context["runtime"] - await self.initialize(runtime) + def signal_handler(): + # Schedule the shutdown coroutine instead of calling it directly + asyncio.create_task(graceful_shutdown(runtime)) - logger.info("Registering LLM for discovery") - endpoint = ( - runtime.namespace(self._config.namespace) - .component(self._config.component) - 
.endpoint(self._config.endpoint) + for sig in (signal.SIGTERM, signal.SIGINT): + loop.add_signal_handler(sig, signal_handler) + + logging.info("Signal handlers set up for graceful shutdown") + + config = cmd_line_args() + await init(runtime, config) + + +async def init(runtime: DistributedRuntime, config: Config): + """ + Instantiate and serve + """ + logging.info(f"Initializing the worker with config: {config}") + + next_client = None + if config.next_endpoint: + logging.info( + f"Initializing next worker client for endpoint: {config.next_endpoint}" + ) + parsed_namespace, parsed_component_name, parsed_endpoint_name = parse_endpoint( + config.next_endpoint ) + next_client = ( + await runtime.namespace(parsed_namespace) + .component(parsed_component_name) + .endpoint(parsed_endpoint_name) + .client() + ) + + component = runtime.namespace(config.namespace).component(config.component) + await component.create_service() - try: + # Convert model path to Path object if it's a local path, otherwise keep as string + model_path = str(config.model_path) + + arg_map = { + "model": model_path, + "tensor_parallel_size": config.tensor_parallel_size, + "backend": "pytorch", + "skip_tokenizer_init": True, + } + if config.extra_engine_args != "": + # TODO: Support extra engine args from json file as well. + arg_map = update_llm_args_with_extra_options(arg_map, config.extra_engine_args) + if config.publish_events_and_metrics: + # 'event_buffer_max_size' is required to enable TRTLLM to publish kv cache events. + kv_cache_config = None + if "kv_cache_config" not in arg_map: + kv_cache_config = {} + kv_cache_config["event_buffer_max_size"] = DEFAULT_KV_EVENT_BUFFER_MAX_SIZE + else: + kv_cache_config = arg_map["kv_cache_config"] + if not kv_cache_config.event_buffer_max_size: + kv_cache_config.event_buffer_max_size = DEFAULT_KV_EVENT_BUFFER_MAX_SIZE + arg_map["kv_cache_config"] = kv_cache_config + + # Only pytorch backend is supported for now to publish events and metrics. + if "backend" not in arg_map: + arg_map["backend"] = "pytorch" + elif arg_map["backend"] != "pytorch": + logging.error( + "Only pytorch backend is supported for now to publish events and metrics." + ) + sys.exit(1) + + logging.info(f"TensorRT-LLM engine args: {arg_map}") + engine_args = arg_map + + # Populate default sampling params from the model + tokenizer = tokenizer_factory(arg_map["model"]) + default_sampling_params = SamplingParams() + default_sampling_params._setup(tokenizer) + default_sampling_params.stop = None + + async with get_tensorrtllm_engine(engine_args) as engine: + endpoint = component.endpoint(config.endpoint) + + if is_first_worker(config): + # Register the model with the endpoint if only the worker is first in the disaggregation chain. 
await register_llm( ModelType.Backend, endpoint, - self._config.model_path, - self._config.served_model_name, - kv_cache_block_size=self._config.kv_block_size, + config.model_path, + config.served_model_name, + kv_cache_block_size=config.kv_block_size, ) - logger.info("Successfully registered LLM for discovery") - except Exception as e: - logger.error(f"Failed to register LLM for discovery: {e}") - raise - - logger.info("TensorRT-LLM Worker initialized") - - @on_shutdown - async def async_cleanup(self): - logger.info("Cleaning up TensorRT-LLM Worker") - await self.cleanup() - logger.info("TensorRT-LLM Worker cleanup completed") - - @endpoint() - async def generate(self, request: TRTLLMWorkerRequest): - async for response in super().generate(request): - yield response + + # publisher will be set later if publishing is enabled. + handler_config = RequestHandlerConfig( + component=component, + engine=engine, + default_sampling_params=default_sampling_params, + publisher=None, + disaggregation_mode=config.disaggregation_mode, + disaggregation_strategy=config.disaggregation_strategy, + next_client=next_client, + ) + + if config.publish_events_and_metrics and is_first_worker(config): + # Initialize and pass in the publisher to the request handler to + # publish events and metrics. + kv_listener = runtime.namespace(config.namespace).component( + config.component + ) + async with get_tensorrtllm_publisher( + component, + engine, + kv_listener, + int(endpoint.lease_id()), + config.kv_block_size, + ) as publisher: + handler_config.publisher = publisher + handler = RequestHandlerFactory().get_request_handler(handler_config) + await endpoint.serve_endpoint(handler.generate) + else: + handler = RequestHandlerFactory().get_request_handler(handler_config) + await endpoint.serve_endpoint(handler.generate) + + +if __name__ == "__main__": + uvloop.install() + asyncio.run(worker()) diff --git a/examples/tensorrt_llm/configs/agg.yaml b/examples/tensorrt_llm/configs/agg.yaml index a3d4594ed8..02b5cd8463 100644 --- a/examples/tensorrt_llm/configs/agg.yaml +++ b/examples/tensorrt_llm/configs/agg.yaml @@ -12,23 +12,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +tensor_parallel_size: 1 +moe_expert_parallel_size: 1 +enable_attention_dp: false +max_num_tokens: 8192 +max_batch_size: 16 +trust_remote_code: true +backend: pytorch +enable_chunked_prefill: true -Frontend: - served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - endpoint: dynamo.TensorRTLLMWorker.generate - port: 8000 - router: round-robin +kv_cache_config: + free_gpu_memory_fraction: 0.95 -TensorRTLLMWorker: - # Path to disk model or HuggingFace model identifier to load - model-path: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - # Name to serve the model under - served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - # Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine. - # The fields in `extra-engine-args` holds higher priority than the above TRTLLM engine fields. 
- extra-engine-args: "configs/engine_configs/agg_config.yaml" - router: round-robin - ServiceArgs: - workers: 1 - resources: - gpu: 1 \ No newline at end of file +# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603 +# NOTE: overlap_scheduler enabled by default since this commit and changed +# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler': +# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 +use_cuda_graph: true diff --git a/examples/tensorrt_llm/configs/agg_router.yaml b/examples/tensorrt_llm/configs/agg_router.yaml deleted file mode 100644 index 58f2a82ab3..0000000000 --- a/examples/tensorrt_llm/configs/agg_router.yaml +++ /dev/null @@ -1,34 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -Frontend: - served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - endpoint: dynamo.TensorRTLLMWorker.generate - port: 8000 - router: kv - -TensorRTLLMWorker: - # Path to disk model or HuggingFace model identifier to load - model-path: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - # Name to serve the model under - served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - # Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine. - # The fields in `extra-engine-args` holds higher priority than the above TRTLLM engine fields. - extra-engine-args: "configs/engine_configs/agg_config.yaml" - router: kv - ServiceArgs: - workers: 1 - resources: - gpu: 1 \ No newline at end of file diff --git a/examples/tensorrt_llm/configs/engine_configs/decode_config.yaml b/examples/tensorrt_llm/configs/decode.yaml similarity index 96% rename from examples/tensorrt_llm/configs/engine_configs/decode_config.yaml rename to examples/tensorrt_llm/configs/decode.yaml index eb943fd6e7..6492c9b9e0 100644 --- a/examples/tensorrt_llm/configs/engine_configs/decode_config.yaml +++ b/examples/tensorrt_llm/configs/decode.yaml @@ -23,5 +23,5 @@ enable_chunked_prefill: true disable_overlap_scheduler: false use_cuda_graph: true kv_cache_config: - free_gpu_memory_fraction: 0.95 + free_gpu_memory_fraction: 0.40 diff --git a/examples/tensorrt_llm/configs/deepseek_r1/agg.yaml b/examples/tensorrt_llm/configs/deepseek_r1/agg.yaml deleted file mode 100644 index f7cec35e7d..0000000000 --- a/examples/tensorrt_llm/configs/deepseek_r1/agg.yaml +++ /dev/null @@ -1,35 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -Frontend: - # This is the client-facing model name, you can set this to anything you'd like. - served_model_name: "nvidia/DeepSeek-R1-FP4" - endpoint: dynamo.TensorRTLLMWorker.generate - port: 8000 - router: round-robin - -TensorRTLLMWorker: - served_model_name: "nvidia/DeepSeek-R1-FP4" - # NOTE: FP4 only supported starting with Blackwell GPUs. - # https://huggingface.co/nvidia/DeepSeek-R1-FP4 - # You can also specify the full path to locally downloaded weights - # instead of a HuggingFace ID here. - model-path: "nvidia/DeepSeek-R1-FP4" - extra-engine-args: "configs/deepseek_r1/engine_configs/agg_config.yaml" - router: round-robin - ServiceArgs: - workers: 1 - resources: - gpu: 4 diff --git a/examples/tensorrt_llm/configs/deepseek_r1/disagg.yaml b/examples/tensorrt_llm/configs/deepseek_r1/disagg.yaml deleted file mode 100644 index 9d96befbe5..0000000000 --- a/examples/tensorrt_llm/configs/deepseek_r1/disagg.yaml +++ /dev/null @@ -1,49 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -Frontend: - # This is the client-facing model name, you can set this to anything you'd like. - served_model_name: "nvidia/DeepSeek-R1-FP4" - endpoint: dynamo.TensorRTLLMWorker.generate - port: 8000 - router: round-robin - -TensorRTLLMWorker: - served_model_name: "nvidia/DeepSeek-R1-FP4" - # NOTE: FP4 only supported starting with Blackwell GPUs. - # https://huggingface.co/nvidia/DeepSeek-R1-FP4 - # You can also specify the full path to locally downloaded weights - # instead of a HuggingFace ID here. - model-path: "nvidia/DeepSeek-R1-FP4" - extra-engine-args: "configs/deepseek_r1/engine_configs/decode_config.yaml" - enable-disagg: true - router: round-robin - ServiceArgs: - workers: 1 - resources: - gpu: 4 - -TensorRTLLMPrefillWorker: - # NOTE: FP4 only supported starting with Blackwell GPUs. - # https://huggingface.co/nvidia/DeepSeek-R1-FP4 - # You can also specify the full path to locally downloaded weights - # instead of a HuggingFace ID here. 
- model-path: "nvidia/DeepSeek-R1-FP4" - extra-engine-args: "configs/deepseek_r1/engine_configs/prefill_config.yaml" - router: round-robin - ServiceArgs: - workers: 1 - resources: - gpu: 4 diff --git a/examples/tensorrt_llm/configs/deepseek_r1/engine_configs/agg_config.yaml b/examples/tensorrt_llm/configs/deepseek_r1/engine_configs/agg_config.yaml deleted file mode 100644 index 29dddba56f..0000000000 --- a/examples/tensorrt_llm/configs/deepseek_r1/engine_configs/agg_config.yaml +++ /dev/null @@ -1,54 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -backend: pytorch - -# TP/EP/PP/DP -tensor_parallel_size: 4 -moe_expert_parallel_size: 4 -pipeline_parallel_size: 1 -enable_attention_dp: false - -max_batch_size: 256 -# 8448 = 8192 ISL + 256 OSL -max_num_tokens: 8448 -max_seq_len: 8448 - -kv_cache_config: - # With dp attention disabled: high free_gpu_memory_fraction is fine. - free_gpu_memory_fraction: 0.85 - # With dp attention enabled: large ISL at high concurrency may need - # free_gpu_memory_fraction low to have enough available memory. - # free_gpu_memory_fraction: 0.30 - -# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603 -# NOTE: overlap_scheduler enabled by default since this commit and changed -# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler': -# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 -use_cuda_graph: true -cuda_graph_padding_enabled: true -# NOTE: For larger max batch size, you may want to add larger cuda graph -# batch sizes below to match. -cuda_graph_batch_sizes: -- 1 -- 2 -- 4 -- 8 -- 16 -- 32 -- 64 -- 128 -- 256 -print_iter_log: true -kv_cache_dtype: fp8 diff --git a/examples/tensorrt_llm/configs/deepseek_r1/engine_configs/decode_config.yaml b/examples/tensorrt_llm/configs/deepseek_r1/engine_configs/decode_config.yaml deleted file mode 100644 index 772b94b283..0000000000 --- a/examples/tensorrt_llm/configs/deepseek_r1/engine_configs/decode_config.yaml +++ /dev/null @@ -1,55 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-backend: pytorch - -# TP/EP/PP/DP -tensor_parallel_size: 4 -moe_expert_parallel_size: 4 -pipeline_parallel_size: 1 -enable_attention_dp: false - -max_batch_size: 256 -max_num_tokens: 256 -# 8448 = 8192 ISL + 256 OSL -max_seq_len: 8448 - -kv_cache_config: - # With dp attention disabled: high free_gpu_memory_fraction is fine. - free_gpu_memory_fraction: 0.85 - # With dp attention enabled: large ISL at high concurrency may need - # free_gpu_memory_fraction low to have enough available memory. - # free_gpu_memory_fraction: 0.30 - -# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603 -# NOTE: overlap_scheduler enabled by default since this commit and changed -# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler': -# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 -disable_overlap_scheduler: false -use_cuda_graph: true -cuda_graph_padding_enabled: true -# NOTE: For larger max batch size, you may want to add larger cuda graph -# batch sizes below to match. -cuda_graph_batch_sizes: -- 1 -- 2 -- 4 -- 8 -- 16 -- 32 -- 64 -- 128 -- 256 -print_iter_log: true -kv_cache_dtype: fp8 diff --git a/examples/tensorrt_llm/configs/deepseek_r1/engine_configs/prefill_config.yaml b/examples/tensorrt_llm/configs/deepseek_r1/engine_configs/prefill_config.yaml deleted file mode 100644 index 6ae899a68a..0000000000 --- a/examples/tensorrt_llm/configs/deepseek_r1/engine_configs/prefill_config.yaml +++ /dev/null @@ -1,37 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -backend: pytorch - -# TP/EP/PP/DP -tensor_parallel_size: 4 -moe_expert_parallel_size: 4 -pipeline_parallel_size: 1 -enable_attention_dp: true - -max_batch_size: 1 -max_num_tokens: 8192 -max_seq_len: 8192 - -kv_cache_config: - free_gpu_memory_fraction: 0.75 - -# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603 -# NOTE: overlap_scheduler enabled by default since this commit and changed -# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler': -# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 -disable_overlap_scheduler: true -print_iter_log: true -# NOTE: This dtype must match in both prefill/decode configs -kv_cache_dtype: fp8 diff --git a/examples/tensorrt_llm/configs/deepseek_r1/mtp/engine_configs/agg_config.yaml b/examples/tensorrt_llm/configs/deepseek_r1/mtp/engine_configs/agg_config.yaml deleted file mode 100644 index f0b5411221..0000000000 --- a/examples/tensorrt_llm/configs/deepseek_r1/mtp/engine_configs/agg_config.yaml +++ /dev/null @@ -1,50 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# NOTE: FP4 only supported starting with Blackwell GPUs. -# https://huggingface.co/nvidia/DeepSeek-R1-FP4 -# You can also specify the full path to locally downloaded weights -# instead of a HuggingFace ID here. - -backend: pytorch -tensor_parallel_size: 4 -moe_expert_parallel_size: 4 -enable_attention_dp: true -max_batch_size: 256 -# 8448 = 8192 ISL + 256 OSL -max_num_tokens: 8448 -max_seq_len: 8448 -kv_cache_config: - free_gpu_memory_fraction: 0.30 - -# Enable the MTP(Multi-Token Prediction) in the model engine -speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 1 - -use_cuda_graph: true -cuda_graph_padding_enabled: true -cuda_graph_batch_sizes: -- 1 -- 2 -- 4 -- 8 -- 16 -- 32 -- 64 -- 128 -- 256 -print_iter_log: true -kv_cache_dtype: fp8 diff --git a/examples/tensorrt_llm/configs/deepseek_r1/mtp/engine_configs/decode_config.yaml b/examples/tensorrt_llm/configs/deepseek_r1/mtp/engine_configs/decode_config.yaml deleted file mode 100644 index ab48b2e78b..0000000000 --- a/examples/tensorrt_llm/configs/deepseek_r1/mtp/engine_configs/decode_config.yaml +++ /dev/null @@ -1,53 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# NOTE: FP4 only supported starting with Blackwell GPUs. -# https://huggingface.co/nvidia/DeepSeek-R1-FP4 -# You can also specify the full path to locally downloaded weights -# instead of a HuggingFace ID here. - -backend: pytorch -tensor_parallel_size: 4 -moe_expert_parallel_size: 4 -enable_attention_dp: false -max_batch_size: 256 -# Note: When MPT is enabled and `cuda_graph_batch_sizes` is specified, `max_num_tokens` must satisfy the following formula: -# max_num_tokens >= max(cuda_graph_batch_sizes) * (num_nextn_predict_layers + 1) -# This is a known issue in TensorRT-LLM and will be resolved in the next release. 
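# Worked example for the values in this file: max(cuda_graph_batch_sizes) is 256 and
# num_nextn_predict_layers is 1, so max_num_tokens must be >= 256 * (1 + 1) = 512,
# which is exactly the max_num_tokens value set below.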
-max_num_tokens: 512 -# 8704 = 8192 ISL + 512 OSL -max_seq_len: 8704 -kv_cache_config: - free_gpu_memory_fraction: 0.85 - -# Enable the MTP(Multi-Token Prediction) in decode model engine -speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 1 - -use_cuda_graph: true -cuda_graph_padding_enabled: true -cuda_graph_batch_sizes: -- 1 -- 2 -- 4 -- 8 -- 16 -- 32 -- 64 -- 128 -- 256 -print_iter_log: true -kv_cache_dtype: fp8 diff --git a/examples/tensorrt_llm/configs/deepseek_r1/mtp/engine_configs/prefill_config.yaml b/examples/tensorrt_llm/configs/deepseek_r1/mtp/engine_configs/prefill_config.yaml deleted file mode 100644 index ee6ee26a94..0000000000 --- a/examples/tensorrt_llm/configs/deepseek_r1/mtp/engine_configs/prefill_config.yaml +++ /dev/null @@ -1,37 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# NOTE: FP4 only supported starting with Blackwell GPUs. -# https://huggingface.co/nvidia/DeepSeek-R1-FP4 -# You can also specify the full path to locally downloaded weights -# instead of a HuggingFace ID here. - -backend: pytorch -tensor_parallel_size: 4 -moe_expert_parallel_size: 4 -enable_attention_dp: true -max_batch_size: 1 -max_num_tokens: 8192 -max_seq_len: 8192 -kv_cache_config: - free_gpu_memory_fraction: 0.75 -print_iter_log: true -kv_cache_dtype: fp8 -disable_overlap_scheduler: true - -# Enable the MTP(Multi-Token Prediction) in the prefill model engine -speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 1 diff --git a/examples/tensorrt_llm/configs/deepseek_r1/mtp/mtp_agg.yaml b/examples/tensorrt_llm/configs/deepseek_r1/mtp/mtp_agg.yaml deleted file mode 100644 index c51abf9d95..0000000000 --- a/examples/tensorrt_llm/configs/deepseek_r1/mtp/mtp_agg.yaml +++ /dev/null @@ -1,36 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -Frontend: - served_model_name: "nvidia/DeepSeek-R1-FP4" - endpoint: dynamo.TensorRTLLMWorker.generate - port: 8000 - router: round-robin - -TensorRTLLMWorker: - served_model_name: "nvidia/DeepSeek-R1-FP4" - # NOTE: FP4 only supported starting with Blackwell GPUs. - # https://huggingface.co/nvidia/DeepSeek-R1-FP4 - # You can also specify the full path to locally downloaded weights - # instead of a HuggingFace ID here. 
- model-path: "nvidia/DeepSeek-R1-FP4" - # Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine. - # The fields in `extra-engine-args` holds higher priority than the above TRTLLM engine fields. - extra-engine-args: "configs/deepseek_r1/mtp/engine_configs/agg_config.yaml" - router: round-robin - ServiceArgs: - workers: 1 - resources: - gpu: 4 diff --git a/examples/tensorrt_llm/configs/deepseek_r1/mtp/mtp_disagg.yaml b/examples/tensorrt_llm/configs/deepseek_r1/mtp/mtp_disagg.yaml deleted file mode 100644 index 5fe2679809..0000000000 --- a/examples/tensorrt_llm/configs/deepseek_r1/mtp/mtp_disagg.yaml +++ /dev/null @@ -1,52 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -Frontend: - served_model_name: "nvidia/DeepSeek-R1-FP4" - endpoint: dynamo.TensorRTLLMWorker.generate - port: 8000 - router: round-robin - -TensorRTLLMWorker: - served_model_name: "nvidia/DeepSeek-R1-FP4" - # NOTE: FP4 only supported starting with Blackwell GPUs. - # https://huggingface.co/nvidia/DeepSeek-R1-FP4 - # You can also specify the full path to locally downloaded weights - # instead of a HuggingFace ID here. - model-path: "nvidia/DeepSeek-R1-FP4" - # Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine. - # The fields in `extra-engine-args` holds higher priority than the above TRTLLM engine fields. - extra-engine-args: "configs/deepseek_r1/mtp/engine_configs/decode_config.yaml" - router: round-robin - enable-disagg: true - ServiceArgs: - workers: 1 - resources: - gpu: 4 - -TensorRTLLMPrefillWorker: - # NOTE: FP4 only supported starting with Blackwell GPUs. - # https://huggingface.co/nvidia/DeepSeek-R1-FP4 - # You can also specify the full path to locally downloaded weights - # instead of a HuggingFace ID here. - model-path: "nvidia/DeepSeek-R1-FP4" - # Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine. - # The fields in `extra-engine-args` holds higher priority than the above TRTLLM engine fields. - extra-engine-args: "configs/deepseek_r1/mtp/engine_configs/prefill_config.yaml" - router: round-robin - ServiceArgs: - workers: 1 - resources: - gpu: 4 diff --git a/examples/tensorrt_llm/configs/deepseek_r1/multinode/README.md b/examples/tensorrt_llm/configs/deepseek_r1/multinode/README.md deleted file mode 100644 index adf19b80ef..0000000000 --- a/examples/tensorrt_llm/configs/deepseek_r1/multinode/README.md +++ /dev/null @@ -1,273 +0,0 @@ - - -# Example: Multi-node TRTLLM Workers with Dynamo on Slurm - -To run a single Dynamo+TRTLLM Worker that spans multiple nodes (ex: TP16), -the set of nodes need to be launched together in the same MPI world, such as -via `mpirun` or `srun`. This is true regardless of whether the worker is -aggregated, prefill-only, or decode-only. 
- -In this document we will demonstrate two examples launching multinode workers -on a slurm cluster with `srun`: -1. Deploying an aggregated nvidia/DeepSeek-R1 model as a multi-node TP16/EP16 - worker across 4 GB200 nodes -2. Deploying a disaggregated nvidia/DeepSeek-R1 model with a multi-node - TP16/EP16 prefill worker (4 nodes) and a multi-node TP16/EP16 decode - worker (4 nodes) across a total of 8 GB200 nodes. - -NOTE: Some of the scripts used in this example like `start_frontend_services.sh` and -`start_trtllm_worker.sh` should be translatable to other environments like Kubernetes, or -using `mpirun` directly, with relative ease. - -## Setup - -For simplicity of the example, we will make some assumptions about your slurm cluster: -1. First, we assume you have access to a slurm cluster with multiple GPU nodes - available. For functional testing, most setups should be fine. For performance - testing, you should aim to allocate groups of nodes that are performantly - inter-connected, such as those in an NVL72 setup. -2. Second, we assume this slurm cluster has the [Pyxis](https://github.com/NVIDIA/pyxis) - SPANK plugin setup. In particular, the `srun_aggregated.sh` script in this - example will use `srun` arguments like `--container-image`, - `--container-mounts`, and `--container-env` that are added to `srun` by Pyxis. - If your cluster supports similar container based plugins, you may be able to - modify the script to use that instead. -3. Third, we assume you have already built a recent Dynamo+TRTLLM container image as - described [here](https://github.com/ai-dynamo/dynamo/tree/main/examples/tensorrt_llm#build-docker). - This is the image that can be set to the `IMAGE` environment variable in later steps. -4. Fourth, we assume you pre-allocate a group of nodes using `salloc`. We - will allocate 8 nodes below as a reference command to have enough capacity - to run both examples. If you plan to only run the aggregated example, you - will only need 4 nodes. If you customize the configurations to require a - different number of nodes, you can adjust the number of allocated nodes - accordingly. Pre-allocating nodes is technically not a requirement, - but it makes iterations of testing/experimenting easier. - - Make sure to set your `PARTITION` and `ACCOUNT` according to your slurm cluster setup: - ```bash - # Set partition manually based on your slurm cluster's partition names - PARTITION="" - # Set account manually if this command doesn't work on your cluster - ACCOUNT="$(sacctmgr -nP show assoc where user=$(whoami) format=account)" - salloc \ - --partition="${PARTITION}" \ - --account="${ACCOUNT}" \ - --job-name="${ACCOUNT}-dynamo.trtllm" \ - -t 05:00:00 \ - --nodes 8 - ``` -5. Lastly, we will assume you are inside an interactive shell on one of your allocated - nodes, which may be the default behavior after executing the `salloc` command above - depending on the cluster setup. If not, then you should SSH into one of the allocated nodes. - -### Environment Variable Setup - -This example aims to automate as much of the environment setup as possible, -but all slurm clusters and environments are different, and you may need to -dive into the scripts to make modifications based on your specific environment. 
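If you want to double-check the allocation you are working inside before exporting anything, standard Slurm commands (not specific to this example) can help:

```bash
# Sanity check: print the job ID and the hostnames of the allocated nodes.
echo "${SLURM_JOB_ID}"
scontrol show hostnames "${SLURM_JOB_NODELIST}"
```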
- -Assuming you have already allocated your nodes via `salloc`, and are -inside an interactive shell on one of the allocated nodes, set the -following environment variables based: -```bash -# NOTE: IMAGE must be set manually for now -# To build an iamge, see the steps here: -# https://github.com/ai-dynamo/dynamo/tree/main/examples/tensorrt_llm#build-docker -export IMAGE="" - -# MOUNTS are the host:container path pairs that are mounted into the containers -# launched by each `srun` command. -# -# If you want to reference files, such as $MODEL_PATH below, in a -# different location, you can customize MOUNTS or specify additional -# comma-separated mount pairs here. -# -# NOTE: Currently, this example assumes that the local bash scripts and configs -# referenced are mounted into into /mnt inside the container. If you want to -# customize the location of the scripts, make sure to modify `srun_aggregated.sh` -# accordingly for the new locations of `start_frontend_services.sh` and -# `start_trtllm_worker.sh`. -# -# For example, assuming your cluster had a `/lustre` directory on the host, you -# could add that as a mount like so: -# -# export MOUNTS="${PWD}:/mnt,/lustre:/lustre" -export MOUNTS="${PWD}:/mnt" - -# NOTE: In general, Deepseek R1 is very large, so it is recommended to -# pre-download the model weights and save them in some shared location, -# NFS storage, HF_CACHE, etc. and modify the `--model-path` below -# to reuse the pre-downloaded weights instead. -# -# On Blackwell systems (ex: GB200), it is recommended to use the FP4 weights: -# https://huggingface.co/nvidia/DeepSeek-R1-FP4 -# -# On Hopper systems, FP4 isn't supported so you'll need to use the default weights: -# https://huggingface.co/deepseek-ai/DeepSeek-R1 -export MODEL_PATH="nvidia/DeepSeek-R1-FP4" - -# The name the model will be served/queried under, matching what's -# returned by the /v1/models endpoint. -# -# By default this is inferred from MODEL_PATH, but when using locally downloaded -# model weights, it can be nice to have explicit control over the name. -export SERVED_MODEL_NAME="nvidia/DeepSeek-R1-FP4" -``` - -## Aggregated WideEP - -Assuming you have at least 4 nodes allocated following the setup steps above, -follow these steps below to launch an **aggregated** deployment across 4 nodes: - -```bash -# Default set in srun_aggregated.sh, but can customize here. -# export ENGINE_CONFIG="/mnt/engine_configs/wide_ep_agg.yaml" - -# Customize NUM_NODES to match the desired parallelism in ENGINE_CONFIG -# The product of NUM_NODES*NUM_GPUS_PER_NODE should match the number of -# total GPUs necessary to satisfy the requested parallelism. For example, -# 4 nodes x 4 gpus/node = 16 gpus total for TP16/EP16. -# export NUM_NODES=4 - -# GB200 nodes have 4 gpus per node, but for other types of nodes you can configure this. -# export NUM_GPUS_PER_NODE=4 - -# Launches: -# - frontend + etcd/nats on current (head) node -# - one large aggregated trtllm worker across multiple nodes via MPI tasks -./srun_aggregated.sh -``` - -## Disaggregated WideEP - -Assuming you have at least 8 nodes allocated (4 for prefill, 4 for decode) -following the setup above, follow these steps below to launch a **disaggregated** -deployment across 8 nodes: - -> [!Tip] -> Make sure you have a fresh environment and don't still have the aggregated -> example above still deployed on the same set of nodes. - -```bash -# Defaults set in srun_disaggregated.sh, but can customize here. 
-# export PREFILL_ENGINE_CONFIG="/mnt/engine_configs/wide_ep_prefill.yaml" -# export DECODE_ENGINE_CONFIG="/mnt/engine_configs/wide_ep_decode.yaml" - -# Customize NUM_PREFILL_NODES to match the desired parallelism in PREFILL_ENGINE_CONFIG -# Customize NUM_DECODE_NODES to match the desired parallelism in DECODE_ENGINE_CONFIG -# The products of NUM_PREFILL_NODES*NUM_GPUS_PER_NODE and -# NUM_DECODE_NODES*NUM_GPUS_PER_NODE should match the respective number of -# GPUs necessary to satisfy the requested parallelism in each config. -# export NUM_PREFILL_NODES=4 -# export NUM_DECODE_NODES=4 - -# GB200 nodes have 4 gpus per node, but for other types of nodes you can configure this. -# export NUM_GPUS_PER_NODE=4 - -# Launches: -# - frontend + etcd/nats on current (head) node. -# - one large prefill trtllm worker across multiple nodes via MPI tasks -# - one large decode trtllm worker across multiple nodes via MPI tasks -./srun_disaggregated.sh -``` - -## Understanding the Output - -1. The `srun_aggregated.sh` launches two `srun` jobs. The first launches - etcd, NATS, and the OpenAI frontend on the head node only - called "node1" in the example output below. The second launches - a single TP16 Dynamo+TRTLLM worker spread across 4 nodes, each node - using 4 GPUs each. - ``` - # Frontend/etcd/nats services - srun: launching StepId=453374.17 on host node1, 1 tasks: 0 - ... - # TP16 TRTLLM worker split across 4 nodes with 4 gpus each - srun: launching StepId=453374.18 on host node1, 4 tasks: [0-3] - srun: launching StepId=453374.18 on host node2, 4 tasks: [4-7] - srun: launching StepId=453374.18 on host node3, 4 tasks: [8-11] - srun: launching StepId=453374.18 on host node4, 4 tasks: [12-15] - ``` -2. The OpenAI frontend will listen for and dynamically discover workers as - they register themselves with Dynamo's distributed runtime: - ``` - 0: 2025-06-13T02:36:48.160Z INFO dynamo_run::input::http: Watching for remote model at models - 0: 2025-06-13T02:36:48.161Z INFO dynamo_llm::http::service::service_v2: Starting HTTP service on: 0.0.0.0:8000 address="0.0.0.0:8000" - ``` -3. The TRTLLM worker will consist of N (N=16 for TP16) MPI ranks, 1 rank on each - GPU on each node, which will each output their progress while loading the model. - You can see each rank's output prefixed with the rank at the start of each log line - until the model succesfully finishes loading: - ``` - 8: rank8 run mgmn worker node with mpi_world_size: 16 ... - 10: rank10 run mgmn worker node with mpi_world_size: 16 ... - 9: rank9 run mgmn worker node with mpi_world_size: 16 ... - 11: rank11 run mgmn worker node with mpi_world_size: 16 ... - ... - 15: Model init total -- 55.42s - 11: Model init total -- 55.91s - 12: Model init total -- 55.24s - ``` -4. After the model fully finishes loading on all ranks, the worker will register itself, - and the OpenAI frontend will detect it, signaled by this output: - ``` - 0: 2025-06-13T02:46:35.040Z INFO dynamo_llm::discovery::watcher: added model model_name="nvidia/DeepSeek-R1-FP4" - ``` -5. At this point, with the worker fully initialized and detected by the frontend, - it is now ready for inference. -6. For `srun_disaggregated.sh`, it follows a very similar flow, but instead launches - three srun jobs instead of two. One for frontend, one for prefill worker, - and one for decode worker. - -## Example Request - -To verify the deployed model is working, send a `curl` request: -```bash -# NOTE: $HOST assumes running on head node, but can be changed to $HEAD_NODE_IP instead. 
-HOST=localhost -PORT=8000 -# "model" here should match the model name returned by the /v1/models endpoint -curl -w "%{http_code}" ${HOST}:${PORT}/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "'${SERVED_MODEL_NAME}'", - "messages": [ - { - "role": "user", - "content": "Tell me a story as if we were playing dungeons and dragons." - } - ], - "stream": true, - "max_tokens": 30 -}' -``` - -## Cleanup - -To cleanup background `srun` processes launched by `srun_aggregated.sh` or -`srun_disaggregated.sh`, you can run: -```bash -pkill srun -``` - -## Known Issues - -- This example has only been tested on a 4xGB200 node setup with 16 GPUs using - FP4 weights. In theory, the example should work on alternative setups such as - H100 nodes with FP8 weights, but this hasn't been tested yet. -- WideEP configs in this directory are still being tested. A WideEP specific - example with documentation will be added once ready. -- There are known issues where WideEP workers may not cleanly shut down: - - This may lead to leftover shared memory files in `/dev/shm/moe_*`. For - now, you must manually clean these up before deploying again on the - same set of nodes. - - Similarly, there may be GPU memory left in-use after killing the `srun` - jobs. After cleaning up any leftover shared memory files as described - above, the GPU memory may slowly come back. You can run `watch nvidia-smi` - to check on this behavior. If you don't free the GPU memory before the - next deployment, you may get a CUDA OOM error while loading the model. - - There is mention of this issue in the relevant TRT-LLM blog - [here](https://github.com/NVIDIA/TensorRT-LLM/blob/6021a439ab9c29f4c46f721eeb59f6b992c425ea/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md#miscellaneous). diff --git a/examples/tensorrt_llm/configs/deepseek_r1/multinode/engine_configs/dep16_agg.yaml b/examples/tensorrt_llm/configs/deepseek_r1/multinode/engine_configs/dep16_agg.yaml deleted file mode 100644 index d697caacfa..0000000000 --- a/examples/tensorrt_llm/configs/deepseek_r1/multinode/engine_configs/dep16_agg.yaml +++ /dev/null @@ -1,27 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Example of a Multi-node worker, but no WideEP or EPLB. -# See wide_ep*.yaml for WideEP example configs. -backend: pytorch -tensor_parallel_size: 16 -moe_expert_parallel_size: 16 -enable_attention_dp: true -max_batch_size: 256 -max_num_tokens: 256 -max_seq_len: 8448 -kv_cache_config: - free_gpu_memory_fraction: 0.7 -use_cuda_graph: true -cuda_graph_padding_enabled: true -cuda_graph_batch_sizes: -- 1 -- 2 -- 4 -- 8 -- 16 -- 32 -- 64 -- 128 -- 256 -kv_cache_dtype: fp8 diff --git a/examples/tensorrt_llm/configs/deepseek_r1/multinode/engine_configs/eplb.yaml b/examples/tensorrt_llm/configs/deepseek_r1/multinode/engine_configs/eplb.yaml deleted file mode 100644 index f2fe0a13c6..0000000000 --- a/examples/tensorrt_llm/configs/deepseek_r1/multinode/engine_configs/eplb.yaml +++ /dev/null @@ -1,7 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -# moe_load_balancer settings for TRTLLM based on: -# https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/ep_load_balancer/README.md#online-ep-load-balancer -num_slots: 288 -layer_updates_per_iter: 2 diff --git a/examples/tensorrt_llm/configs/deepseek_r1/multinode/engine_configs/wide_ep_agg.yaml b/examples/tensorrt_llm/configs/deepseek_r1/multinode/engine_configs/wide_ep_agg.yaml deleted file mode 100644 index 5bbc66bd69..0000000000 --- a/examples/tensorrt_llm/configs/deepseek_r1/multinode/engine_configs/wide_ep_agg.yaml +++ /dev/null @@ -1,35 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -backend: pytorch - -# WideEP related settings -moe_backend: WideEP -# moe_max_num_tokens will default to max_num_tokens if left unspecified. -# -# If you want to set this value explicitly, one recommendation is below: -# moe_max_num_tokens = max_batch_size * moe_expert_parallel_size -# 4096 = 256 * 16 -# moe_max_num_tokens: 4096 -moe_load_balancer: /mnt/engine_configs/eplb.yaml -tensor_parallel_size: 16 -moe_expert_parallel_size: 16 - -enable_attention_dp: true -max_batch_size: 256 -max_num_tokens: 256 -max_seq_len: 8448 -kv_cache_config: - free_gpu_memory_fraction: 0.7 -use_cuda_graph: true -cuda_graph_padding_enabled: true -cuda_graph_batch_sizes: -- 1 -- 2 -- 4 -- 8 -- 16 -- 32 -- 64 -- 128 -- 256 -kv_cache_dtype: fp8 diff --git a/examples/tensorrt_llm/configs/deepseek_r1/multinode/engine_configs/wide_ep_decode.yaml b/examples/tensorrt_llm/configs/deepseek_r1/multinode/engine_configs/wide_ep_decode.yaml deleted file mode 100644 index ac7fc7e8f6..0000000000 --- a/examples/tensorrt_llm/configs/deepseek_r1/multinode/engine_configs/wide_ep_decode.yaml +++ /dev/null @@ -1,59 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -backend: pytorch - -# WideEP related settings -moe_backend: WideEP -moe_load_balancer: /mnt/engine_configs/eplb.yaml - -# TP/EP/PP/DP -tensor_parallel_size: 16 -moe_expert_parallel_size: 16 -pipeline_parallel_size: 1 -enable_attention_dp: true - -max_batch_size: 256 -max_num_tokens: 256 -# 8448 = 8192 ISL + 256 OSL -max_seq_len: 8448 - -kv_cache_config: - # With dp attention disabled: high free_gpu_memory_fraction is fine. - # free_gpu_memory_fraction: 0.85 - # With dp attention enabled: large ISL at high concurrency may need - # free_gpu_memory_fraction low to have enough available memory. 
- free_gpu_memory_fraction: 0.30 - -# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603 -# NOTE: overlap_scheduler enabled by default since this commit and changed -# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler': -# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 -disable_overlap_scheduler: false -use_cuda_graph: true -cuda_graph_padding_enabled: true -# NOTE: For larger max batch size, you may want to add larger cuda graph -# batch sizes below to match. -cuda_graph_batch_sizes: -- 1 -- 2 -- 4 -- 8 -- 16 -- 32 -- 64 -- 128 -- 256 -print_iter_log: true -kv_cache_dtype: fp8 diff --git a/examples/tensorrt_llm/configs/deepseek_r1/multinode/engine_configs/wide_ep_prefill.yaml b/examples/tensorrt_llm/configs/deepseek_r1/multinode/engine_configs/wide_ep_prefill.yaml deleted file mode 100644 index 06968a3a76..0000000000 --- a/examples/tensorrt_llm/configs/deepseek_r1/multinode/engine_configs/wide_ep_prefill.yaml +++ /dev/null @@ -1,41 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -backend: pytorch - -# WideEP related settings -moe_backend: WideEP -moe_load_balancer: /mnt/engine_configs/eplb.yaml - -# TP/EP/PP/DP -tensor_parallel_size: 16 -moe_expert_parallel_size: 16 -pipeline_parallel_size: 1 -enable_attention_dp: true - -max_batch_size: 1 -max_num_tokens: 8192 -max_seq_len: 8192 - -kv_cache_config: - free_gpu_memory_fraction: 0.75 - -# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603 -# NOTE: overlap_scheduler enabled by default since this commit and changed -# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler': -# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 -disable_overlap_scheduler: true -print_iter_log: true -# NOTE: This dtype must match in both prefill/decode configs -kv_cache_dtype: fp8 diff --git a/examples/tensorrt_llm/configs/deepseek_r1/multinode/srun_aggregated.sh b/examples/tensorrt_llm/configs/deepseek_r1/multinode/srun_aggregated.sh deleted file mode 100755 index 5a632551b9..0000000000 --- a/examples/tensorrt_llm/configs/deepseek_r1/multinode/srun_aggregated.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -# This is one of the only variables that must be set currently, most of the rest may -# just work out of the box if following the steps in the README. -IMAGE="${IMAGE:-""}" - -# Set to mount current host directory to /mnt inside the container as an example, -# but you may freely customize the mounts based on your cluster. A common practice -# is to mount paths to NFS storage for common scripts, model weights, etc. 
-# NOTE: This can be a comma separated list of multiple mounts as well. -DEFAULT_MOUNT="${PWD}:/mnt" -MOUNTS="${MOUNTS:-${DEFAULT_MOUNT}}" - -# Example values, assuming 4 nodes with 4 GPUs on each node, such as 4xGB200 nodes. -# For 8xH100 nodes as an example, you may set this to 2 nodes x 8 gpus/node instead. -NUM_NODES=${NUM_NODES:-4} -NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE:-4} - -export ENGINE_CONFIG="${ENGINE_CONFIG:-/mnt/engine_configs/wide_ep_agg.yaml}" - -# Automate settings of certain variables for convenience, but you are free -# to manually set these for more control as well. -ACCOUNT="$(sacctmgr -nP show assoc where user=$(whoami) format=account)" -export HEAD_NODE="${SLURMD_NODENAME}" -export HEAD_NODE_IP="$(hostname -i)" -export ETCD_ENDPOINTS="${HEAD_NODE_IP}:2379" -export NATS_SERVER="nats://${HEAD_NODE_IP}:4222" - -if [[ -z ${IMAGE} ]]; then - echo "ERROR: You need to set the IMAGE environment variable to the " \ - "Dynamo+TRTLLM docker image or .sqsh file from 'enroot import' " \ - "See how to build one from source here: " \ - "https://github.com/ai-dynamo/dynamo/tree/main/examples/tensorrt_llm#build-docker" - exit 1 -fi - -# NOTE: Output streamed to stdout for ease of understanding the example, but -# in practice you would probably set `srun --output ... --error ...` to pipe -# the stdout/stderr to files. -echo "Launching frontend services in background." -srun \ - --overlap \ - --container-image "${IMAGE}" \ - --container-mounts "${MOUNTS}" \ - --verbose \ - --label \ - -A "${ACCOUNT}" \ - -J "${ACCOUNT}-dynamo.trtllm" \ - --nodelist "${HEAD_NODE}" \ - --nodes 1 \ - --jobid "${SLURM_JOB_ID}" \ - /mnt/start_frontend_services.sh & - -# NOTE: Output streamed to stdout for ease of understanding the example, but -# in practice you would probably set `srun --output ... --error ...` to pipe -# the stdout/stderr to files. -echo "Launching multi-node worker in background." -# No --task for the worker defaults to aggregated mode -TASK="" \ -srun \ - --mpi pmix \ - --oversubscribe \ - --container-image "${IMAGE}" \ - --container-mounts "${MOUNTS}" \ - --container-env ETCD_ENDPOINTS,NATS_SERVER,HEAD_NODE_IP,HEAD_NODE,TASK,ENGINE_CONFIG \ - --verbose \ - --label \ - -A "${ACCOUNT}" \ - -J "${ACCOUNT}-dynamo.trtllm" \ - --nodes "${NUM_NODES}" \ - --ntasks-per-node "${NUM_GPUS_PER_NODE}" \ - --jobid "${SLURM_JOB_ID}" \ - /mnt/start_trtllm_worker.sh & diff --git a/examples/tensorrt_llm/configs/deepseek_r1/multinode/srun_disaggregated.sh b/examples/tensorrt_llm/configs/deepseek_r1/multinode/srun_disaggregated.sh deleted file mode 100755 index 32cb4993a9..0000000000 --- a/examples/tensorrt_llm/configs/deepseek_r1/multinode/srun_disaggregated.sh +++ /dev/null @@ -1,94 +0,0 @@ -#!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -# This is one of the only variables that must be set currently, most of the rest may -# just work out of the box if following the steps in the README. -IMAGE="${IMAGE:-""}" - -# Set to mount current host directory to /mnt inside the container as an example, -# but you may freely customize the mounts based on your cluster. A common practice -# is to mount paths to NFS storage for common scripts, model weights, etc. -# NOTE: This can be a comma separated list of multiple mounts as well. 
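# e.g. MOUNTS="${PWD}:/mnt,/lustre:/lustre" to also expose a shared /lustre filesystem.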
-DEFAULT_MOUNT="${PWD}:/mnt" -MOUNTS="${MOUNTS:-${DEFAULT_MOUNT}}" - -NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE:-4} - -NUM_PREFILL_NODES=${NUM_PREFILL_NODES:-4} -PREFILL_ENGINE_CONFIG="${PREFILL_ENGINE_CONFIG:-/mnt/engine_configs/wide_ep_prefill.yaml}" - -NUM_DECODE_NODES=${NUM_DECODE_NODES:-4} -DECODE_ENGINE_CONFIG="${DECODE_ENGINE_CONFIG:-/mnt/engine_configs/wide_ep_decode.yaml}" - -# Automate settings of certain variables for convenience, but you are free -# to manually set these for more control as well. -ACCOUNT="$(sacctmgr -nP show assoc where user=$(whoami) format=account)" -export HEAD_NODE="${SLURMD_NODENAME}" -export HEAD_NODE_IP="$(hostname -i)" -export ETCD_ENDPOINTS="${HEAD_NODE_IP}:2379" -export NATS_SERVER="nats://${HEAD_NODE_IP}:4222" - -if [[ -z ${IMAGE} ]]; then - echo "ERROR: You need to set the IMAGE environment variable to the " \ - "Dynamo+TRTLLM docker image or .sqsh file from 'enroot import' " \ - "See how to build one from source here: " \ - "https://github.com/ai-dynamo/dynamo/tree/main/examples/tensorrt_llm#build-docker" - exit 1 -fi - -# NOTE: Output streamed to stdout for ease of understanding the example, but -# in practice you would probably set `srun --output ... --error ...` to pipe -# the stdout/stderr to files. -echo "Launching frontend services in background." -srun \ - --overlap \ - --container-image "${IMAGE}" \ - --container-mounts "${MOUNTS}" \ - --verbose \ - --label \ - -A "${ACCOUNT}" \ - -J "${ACCOUNT}-dynamo.trtllm" \ - --nodelist "${HEAD_NODE}" \ - --nodes 1 \ - --jobid "${SLURM_JOB_ID}" \ - /mnt/start_frontend_services.sh & - -# NOTE: Output streamed to stdout for ease of understanding the example, but -# in practice you would probably set `srun --output ... --error ...` to pipe -# the stdout/stderr to files. -echo "Launching multi-node prefill worker in background." -TASK=prefill \ -ENGINE_CONFIG=${PREFILL_ENGINE_CONFIG} \ -srun \ - --mpi pmix \ - --oversubscribe \ - --container-image "${IMAGE}" \ - --container-mounts "${MOUNTS}" \ - --container-env ETCD_ENDPOINTS,NATS_SERVER,HEAD_NODE_IP,HEAD_NODE,TASK,ENGINE_CONFIG \ - --verbose \ - --label \ - -A "${ACCOUNT}" \ - -J "${ACCOUNT}-dynamo.trtllm" \ - --nodes "${NUM_PREFILL_NODES}" \ - --ntasks-per-node "${NUM_GPUS_PER_NODE}" \ - --jobid "${SLURM_JOB_ID}" \ - /mnt/start_trtllm_worker.sh & - -echo "Launching multi-node decode worker in background." -TASK=decode \ -ENGINE_CONFIG=${DECODE_ENGINE_CONFIG} \ -srun \ - --mpi pmix \ - --oversubscribe \ - --container-image "${IMAGE}" \ - --container-mounts "${MOUNTS}" \ - --container-env ETCD_ENDPOINTS,NATS_SERVER,HEAD_NODE_IP,HEAD_NODE,TASK,ENGINE_CONFIG \ - --verbose \ - --label \ - -A "${ACCOUNT}" \ - -J "${ACCOUNT}-dynamo.trtllm" \ - --nodes "${NUM_DECODE_NODES}" \ - --ntasks-per-node "${NUM_GPUS_PER_NODE}" \ - --jobid "${SLURM_JOB_ID}" \ - /mnt/start_trtllm_worker.sh & diff --git a/examples/tensorrt_llm/configs/deepseek_r1/multinode/start_frontend_services.sh b/examples/tensorrt_llm/configs/deepseek_r1/multinode/start_frontend_services.sh deleted file mode 100755 index 0d1b588904..0000000000 --- a/examples/tensorrt_llm/configs/deepseek_r1/multinode/start_frontend_services.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -# Start NATS -nats-server -js & - -# Start etcd -etcd --listen-client-urls http://0.0.0.0:2379 --advertise-client-urls http://0.0.0.0:2379 --data-dir /tmp/etcd & - -# Wait for NATS/etcd to startup -sleep 3 - -# Start OpenAI Frontend which will dynamically discover workers when they startup -# NOTE: This is a blocking call. -dynamo-run in=http out=dyn --http-port 8000 diff --git a/examples/tensorrt_llm/configs/deepseek_r1/multinode/start_trtllm_worker.sh b/examples/tensorrt_llm/configs/deepseek_r1/multinode/start_trtllm_worker.sh deleted file mode 100755 index 257b3b1127..0000000000 --- a/examples/tensorrt_llm/configs/deepseek_r1/multinode/start_trtllm_worker.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -if [[ -z ${MODEL_PATH} ]]; then - echo "ERROR: MODEL_PATH was not set." - echo "ERROR: MODEL_PATH must be set to either the HuggingFace ID or locally " \ - "downloaded path to the model weights. Since Deepseek R1 is large, it is " \ - "recommended to pre-download them to a shared location and provide the path." - exit 1 -fi - -if [[ -z ${SERVED_MODEL_NAME} ]]; then - echo "WARNING: SERVED_MODEL_NAME was not set. It will be derived from MODEL_PATH." -fi - - - -if [[ -z ${ENGINE_CONFIG} ]]; then - echo "ERROR: ENGINE_CONFIG was not set." - echo "ERROR: ENGINE_CONFIG must be set to a valid Dynamo+TRTLLM engine config file." - exit 1 -fi - -EXTRA_ARGS="" -if [[ -n ${TASK} ]]; then - EXTRA_ARGS+="--task ${TASK}" -fi - -# NOTE: When this script is run directly from srun, the environment variables -# for TRTLLM KV cache are not set. So we need to set them here. -# Related issue: https://github.com/ai-dynamo/dynamo/issues/1743 -if [[ -z ${TRTLLM_USE_UCX_KVCACHE} ]] && [[ -z ${TRTLLM_USE_NIXL_KVCACHE} ]]; then - export TRTLLM_USE_UCX_KVCACHE=1 -fi - -# NOTE: trtllm_inc.py is a standalone python script that launches a Dynamo+TRTLLM -# worker and registers itself with the runtime. It is currently easier to wrap -# this standalone script with `trtllm-llmapi-launch` for MPI handling purposes, -# but this may be refactored into 'dynamo serve' in the future. -trtllm-llmapi-launch \ - python3 /workspace/launch/dynamo-run/src/subprocess/trtllm_inc.py \ - --model-path "${MODEL_PATH}" \ - --model-name "${SERVED_MODEL_NAME}" \ - --extra-engine-args "${ENGINE_CONFIG}" \ - ${EXTRA_ARGS} diff --git a/examples/tensorrt_llm/configs/disagg.yaml b/examples/tensorrt_llm/configs/disagg.yaml deleted file mode 100644 index 454e1640e6..0000000000 --- a/examples/tensorrt_llm/configs/disagg.yaml +++ /dev/null @@ -1,48 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -Frontend: - served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - endpoint: dynamo.TensorRTLLMWorker.generate - port: 8000 - router: round-robin - -TensorRTLLMWorker: - # Path to disk model or HuggingFace model identifier to load - model-path: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - # Name to serve the model under - served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - # Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine. - # The fields in `extra-engine-args` holds higher priority than the above TRTLLM engine fields. - extra-engine-args: "configs/engine_configs/decode_config.yaml" - enable-disagg: true - router: round-robin - ServiceArgs: - workers: 1 - resources: - gpu: 1 - -TensorRTLLMPrefillWorker: - # Path to disk model or HuggingFace model identifier to load - model-path: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - # Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine. - # The fields in `extra-engine-args` holds higher priority than the above TRTLLM engine fields. - extra-engine-args: "configs/engine_configs/prefill_config.yaml" - router: round-robin - ServiceArgs: - workers: 1 - resources: - gpu: 1 - diff --git a/examples/tensorrt_llm/configs/disagg_router.yaml b/examples/tensorrt_llm/configs/disagg_router.yaml deleted file mode 100644 index faae7f65a3..0000000000 --- a/examples/tensorrt_llm/configs/disagg_router.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -Frontend: - served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - endpoint: dynamo.TensorRTLLMWorker.generate - port: 8000 - router: kv - -TensorRTLLMWorker: - # Path to disk model or HuggingFace model identifier to load - model-path: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - # Name to serve the model under - served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - # Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine. - # The fields in `extra-engine-args` holds higher priority than the above TRTLLM engine fields. - extra-engine-args: "configs/engine_configs/decode_config.yaml" - enable-disagg: true - router: kv - ServiceArgs: - workers: 1 - resources: - gpu: 1 - -TensorRTLLMPrefillWorker: - # Path to disk model or HuggingFace model identifier to load - model-path: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - # Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine. - # The fields in `extra-engine-args` holds higher priority than the above TRTLLM engine fields. 
- extra-engine-args: "configs/engine_configs/prefill_config.yaml" - router: round-robin - ServiceArgs: - workers: 1 - resources: - gpu: 1 \ No newline at end of file diff --git a/examples/tensorrt_llm/configs/engine_configs/agg_config.yaml b/examples/tensorrt_llm/configs/engine_configs/agg_config.yaml deleted file mode 100644 index 02b5cd8463..0000000000 --- a/examples/tensorrt_llm/configs/engine_configs/agg_config.yaml +++ /dev/null @@ -1,31 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -tensor_parallel_size: 1 -moe_expert_parallel_size: 1 -enable_attention_dp: false -max_num_tokens: 8192 -max_batch_size: 16 -trust_remote_code: true -backend: pytorch -enable_chunked_prefill: true - -kv_cache_config: - free_gpu_memory_fraction: 0.95 - -# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603 -# NOTE: overlap_scheduler enabled by default since this commit and changed -# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler': -# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 -use_cuda_graph: true diff --git a/examples/tensorrt_llm/configs/engine_configs/prefill_config.yaml b/examples/tensorrt_llm/configs/prefill.yaml similarity index 96% rename from examples/tensorrt_llm/configs/engine_configs/prefill_config.yaml rename to examples/tensorrt_llm/configs/prefill.yaml index 5dee9e653d..0e7ad45fed 100644 --- a/examples/tensorrt_llm/configs/engine_configs/prefill_config.yaml +++ b/examples/tensorrt_llm/configs/prefill.yaml @@ -25,4 +25,4 @@ disable_overlap_scheduler: true use_cuda_graph: false kv_cache_config: - free_gpu_memory_fraction: 0.95 + free_gpu_memory_fraction: 0.40 diff --git a/examples/tensorrt_llm/graphs/agg.py b/examples/tensorrt_llm/graphs/agg.py deleted file mode 100644 index e79f5f315c..0000000000 --- a/examples/tensorrt_llm/graphs/agg.py +++ /dev/null @@ -1,19 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from components.frontend import Frontend -from components.worker import TensorRTLLMWorker - -Frontend.link(TensorRTLLMWorker) diff --git a/examples/tensorrt_llm/graphs/disagg.py b/examples/tensorrt_llm/graphs/disagg.py deleted file mode 100644 index 58bde05d9a..0000000000 --- a/examples/tensorrt_llm/graphs/disagg.py +++ /dev/null @@ -1,20 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from components.frontend import Frontend -from components.prefill_worker import TensorRTLLMPrefillWorker -from components.worker import TensorRTLLMWorker - -Frontend.link(TensorRTLLMWorker).link(TensorRTLLMPrefillWorker) diff --git a/examples/tensorrt_llm/launch/agg.sh b/examples/tensorrt_llm/launch/agg.sh new file mode 100755 index 0000000000..29fded4664 --- /dev/null +++ b/examples/tensorrt_llm/launch/agg.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Setup cleanup trap +cleanup() { + echo "Cleaning up background processes..." + kill $DYNAMO_PID 2>/dev/null || true + wait $DYNAMO_PID 2>/dev/null || true + echo "Cleanup complete." +} +trap cleanup EXIT INT TERM + +# run clear_namespace +python3 utils/clear_namespace.py --namespace dynamo + +# run ingress +dynamo run in=http out=dyn --http-port=8000 & +DYNAMO_PID=$! + +# run worker +python3 components/worker.py \ + --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ + --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ + --extra-engine-args configs/agg.yaml diff --git a/examples/tensorrt_llm/launch/agg_router.sh b/examples/tensorrt_llm/launch/agg_router.sh new file mode 100755 index 0000000000..9aaa7e1711 --- /dev/null +++ b/examples/tensorrt_llm/launch/agg_router.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Setup cleanup trap +cleanup() { + echo "Cleaning up background processes..." + kill $DYNAMO_PID 2>/dev/null || true + wait $DYNAMO_PID 2>/dev/null || true + echo "Cleanup complete." +} +trap cleanup EXIT INT TERM + +# run clear_namespace +python3 utils/clear_namespace.py --namespace dynamo + +# run ingress +dynamo run in=http out=dyn --router-mode kv --http-port=8000 & +DYNAMO_PID=$! 
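+
+# NOTE (illustrative): the worker below passes --publish-events-and-metrics so
+# that the KV-aware router started above (--router-mode kv) can receive KV cache
+# events and metrics from the engine; this is the main difference from agg.sh.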
+ +# run worker +python3 components/worker.py \ + --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ + --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ + --extra-engine-args configs/agg.yaml \ + --publish-events-and-metrics \ No newline at end of file diff --git a/examples/tensorrt_llm/launch/disagg.sh b/examples/tensorrt_llm/launch/disagg.sh new file mode 100755 index 0000000000..da40d2971c --- /dev/null +++ b/examples/tensorrt_llm/launch/disagg.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Setup cleanup trap +cleanup() { + echo "Cleaning up background processes..." + kill $DYNAMO_PID $PREFILL_PID 2>/dev/null || true + wait $DYNAMO_PID $PREFILL_PID 2>/dev/null || true + echo "Cleanup complete." +} +trap cleanup EXIT INT TERM + +# run clear_namespace +python3 utils/clear_namespace.py --namespace dynamo + +# run ingress +dynamo run in=http out=dyn --http-port=8000 & +DYNAMO_PID=$! + +# run prefill worker +python3 components/worker.py \ + --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ + --served-model-name TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ + --extra-engine-args configs/prefill.yaml \ + --disaggregation-mode prefill & +PREFILL_PID=$! + +# run decode worker +python3 components/worker.py \ + --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ + --served-model-name TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ + --extra-engine-args configs/decode.yaml \ + --disaggregation-mode decode \ No newline at end of file diff --git a/examples/tensorrt_llm/launch/disagg_prefill_first_strategy.sh b/examples/tensorrt_llm/launch/disagg_prefill_first_strategy.sh new file mode 100755 index 0000000000..2aa4130366 --- /dev/null +++ b/examples/tensorrt_llm/launch/disagg_prefill_first_strategy.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Setup cleanup trap +cleanup() { + echo "Cleaning up background processes..." + kill $DYNAMO_PID $PREFILL_PID 2>/dev/null || true + wait $DYNAMO_PID $PREFILL_PID 2>/dev/null || true + echo "Cleanup complete." +} +trap cleanup EXIT INT TERM + +# run clear_namespace +python3 utils/clear_namespace.py --namespace dynamo + +# run ingress +dynamo run in=http out=dyn --http-port=8000 & +DYNAMO_PID=$! + +# run prefill worker +python3 components/worker.py \ + --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ + --served-model-name TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ + --extra-engine-args configs/prefill.yaml \ + --disaggregation-mode prefill \ + --disaggregation-strategy prefill_first& +PREFILL_PID=$! + +# run decode worker +python3 components/worker.py \ + --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ + --served-model-name TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ + --extra-engine-args configs/decode.yaml \ + --disaggregation-mode decode \ + --disaggregation-strategy prefill_first \ No newline at end of file diff --git a/examples/tensorrt_llm/launch/disagg_router.sh b/examples/tensorrt_llm/launch/disagg_router.sh new file mode 100755 index 0000000000..0fc81d7c8d --- /dev/null +++ b/examples/tensorrt_llm/launch/disagg_router.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Setup cleanup trap +cleanup() { + echo "Cleaning up background processes..." 
+ kill $DYNAMO_PID $PREFILL_PID 2>/dev/null || true + wait $DYNAMO_PID $PREFILL_PID 2>/dev/null || true + echo "Cleanup complete." +} +trap cleanup EXIT INT TERM + +# run clear_namespace +python3 utils/clear_namespace.py --namespace dynamo + +# run ingress +dynamo run in=http out=dyn --router-mode kv --http-port=8000 & +DYNAMO_PID=$! + +# run prefill worker +python3 components/worker.py \ + --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ + --served-model-name TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ + --extra-engine-args configs/prefill.yaml \ + --disaggregation-mode prefill \ + --disaggregation-strategy prefill_first \ + --publish-events-and-metrics & +PREFILL_PID=$! + +# run decode worker +python3 components/worker.py \ + --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ + --served-model-name TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ + --extra-engine-args configs/decode.yaml \ + --disaggregation-mode decode \ + --disaggregation-strategy prefill_first \ No newline at end of file diff --git a/examples/tensorrt_llm/utils/clear_namespace.py b/examples/tensorrt_llm/utils/clear_namespace.py new file mode 100644 index 0000000000..e7fcc46485 --- /dev/null +++ b/examples/tensorrt_llm/utils/clear_namespace.py @@ -0,0 +1,44 @@ +# SPDX-FileCopyrightText: Copyright (c) 2020 Atalaya Tech. Inc +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# # +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# # +# http://www.apache.org/licenses/LICENSE-2.0 +# # +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modifications Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES + +import argparse +import asyncio +import logging + +from dynamo.runtime import DistributedRuntime, EtcdKvCache, dynamo_worker +from dynamo.runtime.logging import configure_dynamo_logging + +configure_dynamo_logging() +logger = logging.getLogger(__name__) + + +@dynamo_worker() +async def clear_namespace(runtime: DistributedRuntime, namespace: str): + etcd_kv_cache = await EtcdKvCache.create( + runtime.etcd_client(), + f"/{namespace}/", + {}, + ) + await etcd_kv_cache.clear_all() + logger.info(f"Cleared /{namespace} in EtcdKvCache") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--namespace", type=str, required=True) + args = parser.parse_args() + asyncio.run(clear_namespace(args.namespace)) diff --git a/examples/tensorrt_llm/utils/disagg_utils.py b/examples/tensorrt_llm/utils/disagg_utils.py new file mode 100644 index 0000000000..70b2a51feb --- /dev/null +++ b/examples/tensorrt_llm/utils/disagg_utils.py @@ -0,0 +1,64 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import base64 + +from tensorrt_llm.llmapi import DisaggregatedParams + + +class DisaggregatedParamsCodec: + """ + Codec for encoding and decoding disaggregated params for network transfer. + """ + + @staticmethod + def decode( + disaggregated_params: DisaggregatedParams, + ) -> DisaggregatedParams: + if disaggregated_params is None: + return None + + opaque_state = ( + base64.b64decode(disaggregated_params.opaque_state) + if disaggregated_params.opaque_state is not None + else None + ) + return DisaggregatedParams( + request_type=disaggregated_params.request_type, + first_gen_tokens=disaggregated_params.first_gen_tokens, + ctx_request_id=disaggregated_params.ctx_request_id, + opaque_state=opaque_state, + draft_tokens=disaggregated_params.draft_tokens, + ) + + @staticmethod + def encode( + disaggregated_params: DisaggregatedParams, + ) -> DisaggregatedParams: + if disaggregated_params is None: + return None + + encoded_opaque_state = ( + base64.b64encode(disaggregated_params.opaque_state).decode("utf-8") + if disaggregated_params.opaque_state is not None + else None + ) + return DisaggregatedParams( + request_type=disaggregated_params.request_type, + first_gen_tokens=disaggregated_params.first_gen_tokens, + ctx_request_id=disaggregated_params.ctx_request_id, + opaque_state=encoded_opaque_state, + draft_tokens=disaggregated_params.draft_tokens, + ) diff --git a/examples/tensorrt_llm/utils/request_handlers/handler_base.py b/examples/tensorrt_llm/utils/request_handlers/handler_base.py new file mode 100644 index 0000000000..0dc3bb76b4 --- /dev/null +++ b/examples/tensorrt_llm/utils/request_handlers/handler_base.py @@ -0,0 +1,145 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from dataclasses import asdict, dataclass + +from tensorrt_llm.llmapi import DisaggregatedParams as LlmDisaggregatedParams +from utils.disagg_utils import DisaggregatedParams, DisaggregatedParamsCodec + +from dynamo.runtime.logging import configure_dynamo_logging + +configure_dynamo_logging() + + +@dataclass +class RequestHandlerConfig: + """ + Configuration for the request handler + """ + + component: object + engine: object + default_sampling_params: object + publisher: object + disaggregation_mode: str + + +class HandlerBase: + """ + Base class for request handlers. 
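+
+    Subclasses implement `generate()` (see handlers.py); `generate_locally()`
+    below drives the TRTLLM engine for the aggregated, prefill, and decode
+    disaggregation modes.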
+ """ + + def __init__(self, config: RequestHandlerConfig): + self.engine = config.engine + self.component = config.component + self.default_sampling_params = config.default_sampling_params + self.publisher = config.publisher + self.disaggregation_mode = config.disaggregation_mode + self.disaggregation_strategy = config.disaggregation_strategy + self.next_client = config.next_client + self.first_generation = True + + def check_error(self, result: dict): + """ + Check if there is an error in the result. + """ + if self.disaggregation_mode == "prefill": + return result["finish_reason"] == "error" + else: + return result["finish_reason"] == "stop" or result["finish_reason"] == "error" + + async def generate_locally(self, request: dict): + """ + Generate responses based on the disaggregation mode in the request. + """ + + logging.debug(f"Request: {request}") + + # Check if there is an error in the publisher error queue + publishers_error = ( + self.publisher.check_error_queue() if self.publisher else None + ) + if publishers_error: + raise publishers_error + + inputs = request["token_ids"] + + # Decode the disaggregated params from the request + disaggregated_params = None + if self.disaggregation_mode == "prefill": + request["stop_conditions"]["max_tokens"] = 1 + disaggregated_params = LlmDisaggregatedParams(request_type="context_only") + + if "disaggregated_params" in request: + disaggregated_params = DisaggregatedParamsCodec.decode( + DisaggregatedParams(**request["disaggregated_params"]) + ) + disaggregated_params.request_type = "generation_only" + + if self.disaggregation_mode == "decode" and disaggregated_params is None: + raise ValueError("Disaggregated params are required for decode mode") + + num_output_tokens_so_far = 0 + + sampling_params = self.default_sampling_params + for key, value in request["sampling_options"].items(): + if not value: + continue + if hasattr(sampling_params, key): + setattr(sampling_params, key, value) + + max_tokens = request["stop_conditions"]["max_tokens"] + if max_tokens: + sampling_params.max_tokens = max_tokens + + # TODO: Instead of True, we should use streaming from the request. + # However, currently dynamo run does not send streaming in the request. + streaming = False if self.disaggregation_mode == "prefill" else True + + async for res in self.engine.llm.generate_async( + inputs=inputs, + sampling_params=sampling_params, + disaggregated_params=disaggregated_params, + streaming=streaming, + ): + # TRTLLM engine needs to start generating tokens first before stats + # can be retrieved. + if self.first_generation and self.publisher: + self.publisher.start() + self.first_generation = False + + if res.finished and self.disaggregation_mode != "prefill": + yield {"finish_reason": "stop", "token_ids": []} + break + + if not res.outputs: + yield {"finish_reason": "error", "token_ids": []} + break + + output = res.outputs[0] + next_total_toks = len(output.token_ids) + out = {"token_ids": output.token_ids[num_output_tokens_so_far:]} + if output.finish_reason: + out["finish_reason"] = output.finish_reason + if output.stop_reason: + out["stop_reason"] = output.stop_reason + if self.disaggregation_mode == "prefill": + # Return the disaggregated params only when operating in prefill mode. 
+ out["disaggregated_params"] = asdict( + DisaggregatedParamsCodec.encode(output.disaggregated_params) + ) + yield out + num_output_tokens_so_far = next_total_toks diff --git a/examples/tensorrt_llm/utils/request_handlers/handlers.py b/examples/tensorrt_llm/utils/request_handlers/handlers.py new file mode 100644 index 0000000000..4498960e61 --- /dev/null +++ b/examples/tensorrt_llm/utils/request_handlers/handlers.py @@ -0,0 +1,141 @@ +import copy +from dataclasses import dataclass + +from utils.request_handlers.handler_base import HandlerBase + + +@dataclass +class RequestHandlerConfig: + """ + Configuration for the request handler + """ + + component: object + engine: object + default_sampling_params: object + publisher: object + disaggregation_mode: str + disaggregation_strategy: str + next_client: object + + +class RequestHandlerFactory: + def __init__(self): + self.handlers = { + "prefill": PrefillHandler, + "decode": DecodeHandler, + "prefill_and_decode": AggregatedHandler, + } + + def _validate_config(self, config: RequestHandlerConfig): + if config.disaggregation_mode not in self.handlers: + raise ValueError( + f"Invalid disaggregation_mode '{config.disaggregation_mode}'. " + f"Supported modes: {list(self.handlers.keys())}" + ) + + if not config.next_client: + if ( + config.disaggregation_mode == "prefill" + and config.disaggregation_strategy == "prefill_first" + ): + raise ValueError( + "Next client is required for the main worker when disaggregation_mode='prefill' and disaggregation_strategy='prefill_first'." + ) + if ( + config.disaggregation_mode == "decode" + and config.disaggregation_strategy == "decode_first" + ): + raise ValueError( + "Next client is required for the decode worker when disaggregation_mode='decode' and disaggregation_strategy='decode_first'." + ) + + def get_request_handler(self, config: RequestHandlerConfig) -> HandlerBase: + self._validate_config(config) + return self.handlers[config.disaggregation_mode](config) + + +def get_request_handler(config: RequestHandlerConfig) -> HandlerBase: + return RequestHandlerFactory().get_request_handler(config) + + +class AggregatedHandler(HandlerBase): + """ + Handler for the aggregated mode. + """ + + def __init__(self, config: RequestHandlerConfig): + super().__init__(config) + + async def generate(self, request: dict): + # Implement all steps locally. + async for res in self.generate_locally(request): + yield res + + +class PrefillHandler(HandlerBase): + """ + Handler for the prefill mode. + """ + + def __init__(self, config: RequestHandlerConfig): + super().__init__(config) + + async def remote_decode(self, request: dict): + async for res in await self.next_client.round_robin(request): + yield res.data() + + async def generate(self, request: dict): + # Generate the prefill response locally + prefill_request = copy.deepcopy(request) + prefill_response = None + response_count = 0 + async for res in self.generate_locally(prefill_request): + prefill_response = res + response_count += 1 + if response_count > 1: + raise ValueError("Prefill response should be generated only once.") + + if self.disaggregation_strategy == "prefill_first" and not self.check_error(prefill_response): + # If operating under prefill_first strategy, the prefill handler needs to trigger + # the decode handler. + request["disaggregated_params"] = prefill_response["disaggregated_params"] + async for res in self.remote_decode(request): + yield res + else: + # Return response to the decode handler. 
+ yield prefill_response + + +class DecodeHandler(HandlerBase): + """ + Handler for the decode mode. + """ + + def __init__(self, config: RequestHandlerConfig): + super().__init__(config) + + async def remote_prefill(self, request: dict): + async for res in await self.next_client.round_robin(request): + yield res + + async def generate(self, request: dict): + if self.disaggregation_strategy == "decode_first": + prefill_response = None + # If operating under decode_first strategy, the decode handler needs to trigger + # the prefill handler. + response_count = 0 + async for res in self.remote_prefill(request): + prefill_response = res + response_count += 1 + if response_count > 1: + raise ValueError("Prefill response should be generated only once.") + + if self.check_error(prefill_response.data()): + yield prefill_response.data() + return + request["disaggregated_params"] = prefill_response.data()["disaggregated_params"] + + + async for res in self.generate_locally(request): + yield res diff --git a/examples/tensorrt_llm/utils/trtllm_utils.py b/examples/tensorrt_llm/utils/trtllm_utils.py new file mode 100644 index 0000000000..740210106f --- /dev/null +++ b/examples/tensorrt_llm/utils/trtllm_utils.py @@ -0,0 +1,177 @@ +import argparse +from typing import Optional + +# Default endpoint for the next worker. +DEFAULT_ENDPOINT = "dyn://dynamo.tensorrt_llm.generate" +DEFAULT_MODEL_PATH = "TinyLlama-1.1B-Instruct" +DEFAULT_NEXT_ENDPOINT = "dyn://dynamo.tensorrt_llm_next.generate" +DEFAULT_DISAGGREGATION_STRATEGY = "decode_first" +DEFAULT_DISAGGREGATION_MODE = "prefill_and_decode" + + +class Config: + """Command line parameters or defaults""" + + namespace: str + component: str + endpoint: str + model_path: str + served_model_name: Optional[str] = None + tensor_parallel_size: int + kv_block_size: int + extra_engine_args: str + publish_events_and_metrics: bool + disaggregation_mode: str + disaggregation_strategy: str + next_endpoint: str + + def __str__(self) -> str: + return ( + f"Config(namespace={self.namespace}, " + f"component={self.component}, " + f"endpoint={self.endpoint}, " + f"model_path={self.model_path}, " + f"served_model_name={self.served_model_name}, " + f"tensor_parallel_size={self.tensor_parallel_size}, " + f"kv_block_size={self.kv_block_size}, " + f"extra_engine_args={self.extra_engine_args}, " + f"publish_events_and_metrics={self.publish_events_and_metrics}, " + f"disaggregation_mode={self.disaggregation_mode}, " + f"disaggregation_strategy={self.disaggregation_strategy}, " + f"next_endpoint={self.next_endpoint})" + ) + + +def is_first_worker(config): + """ + Check if the current worker is the first worker in the disaggregation chain. + """ + is_primary_worker = config.disaggregation_mode == "prefill_and_decode" + if not is_primary_worker: + is_primary_worker = (config.disaggregation_strategy == "prefill_first") and ( + config.disaggregation_mode == "prefill" + ) + + if not is_primary_worker: + is_primary_worker = (config.disaggregation_strategy == "decode_first") and ( + config.disaggregation_mode == "decode" + ) + + return is_primary_worker + + +def parse_endpoint(endpoint: str) -> tuple[str, str, str]: + endpoint_str = endpoint.replace("dyn://", "", 1) + endpoint_parts = endpoint_str.split(".") + if len(endpoint_parts) != 3: + raise ValueError( + f"Invalid endpoint format: '{endpoint}'. " + "Expected 'dyn://namespace.component.endpoint' or 'namespace.component.endpoint'." 
+ ) + + return tuple(endpoint_parts) + + +def cmd_line_args(): + parser = argparse.ArgumentParser( + description="TensorRT-LLM server integrated with Dynamo LLM." + ) + parser.add_argument( + "--endpoint", + type=str, + default="", + help=f"Dynamo endpoint string in 'dyn://namespace.component.endpoint' format. Default: {DEFAULT_ENDPOINT} if first worker, {DEFAULT_NEXT_ENDPOINT} if next worker", + ) + parser.add_argument( + "--model-path", + type=str, + default=DEFAULT_MODEL_PATH, + help=f"Path to disk model or HuggingFace model identifier to load. Default: {DEFAULT_MODEL_PATH}", + ) + parser.add_argument( + "--served-model-name", + type=str, + default="", + help="Name to serve the model under. Defaults to deriving it from model path.", + ) + parser.add_argument( + "--tensor-parallel-size", type=int, default=1, help="Number of GPUs to use." + ) + # IMPORTANT: We should ideally not expose this to users. We should be able to + # query the block size from the TRTLLM engine. + parser.add_argument( + "--kv-block-size", type=int, default=32, help="Size of a KV cache block." + ) + + parser.add_argument( + "--extra-engine-args", + type=str, + default="", + help="Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine.", + ) + parser.add_argument( + "--publish-events-and-metrics", + action="store_true", + help="Publish events and metrics to the dynamo components. Note: This is not supported when running in prefill disaggregation mode.", + ) + parser.add_argument( + "--disaggregation-mode", + type=str, + default=DEFAULT_DISAGGREGATION_MODE, + help=f"Mode to use for disaggregation. Default: {DEFAULT_DISAGGREGATION_MODE}", + ) + parser.add_argument( + "--disaggregation-strategy", + type=str, + default=DEFAULT_DISAGGREGATION_STRATEGY, + help=f"Strategy to use for disaggregation. Default: {DEFAULT_DISAGGREGATION_STRATEGY}", + ) + parser.add_argument( + "--next-endpoint", + type=str, + default="", + help=f"Endpoint(in 'dyn://namespace.component.endpoint' format) to send requests to when running in disaggregation mode. Default: {DEFAULT_NEXT_ENDPOINT} if first worker, empty if next worker", + ) + args = parser.parse_args() + + # Set the appropriate defaults for the endpoint and next endpoint. 
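+    # Illustrative example of how these defaults play out (assuming no explicit
+    # --endpoint/--next-endpoint flags): a decode worker launched with
+    # --disaggregation-strategy decode_first is the "first" worker, so it serves
+    # DEFAULT_ENDPOINT and forwards prefill requests to DEFAULT_NEXT_ENDPOINT,
+    # while the matching prefill worker serves DEFAULT_NEXT_ENDPOINT and must
+    # not set --next-endpoint at all.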
+ if is_first_worker(args): + if args.endpoint == "": + args.endpoint = DEFAULT_ENDPOINT + if ( + args.next_endpoint == "" + and args.disaggregation_mode != "prefill_and_decode" + ): + args.next_endpoint = DEFAULT_NEXT_ENDPOINT + else: + if args.endpoint == "": + args.endpoint = DEFAULT_NEXT_ENDPOINT + if args.next_endpoint != "": + raise ValueError("Next endpoint is not allowed for the next worker") + + endpoint = args.endpoint + + config = Config() + config.model_path = args.model_path + if args.served_model_name: + config.served_model_name = args.served_model_name + else: + # This becomes an `Option` on the Rust side + config.served_model_name = None + + parsed_namespace, parsed_component_name, parsed_endpoint_name = parse_endpoint( + endpoint + ) + + config.namespace = parsed_namespace + config.component = parsed_component_name + config.endpoint = parsed_endpoint_name + config.tensor_parallel_size = args.tensor_parallel_size + config.kv_block_size = args.kv_block_size + config.extra_engine_args = args.extra_engine_args + config.publish_events_and_metrics = args.publish_events_and_metrics + config.disaggregation_mode = args.disaggregation_mode + config.disaggregation_strategy = args.disaggregation_strategy + config.next_endpoint = args.next_endpoint + + return config From 42e3082e39da61de871770cae33f53efd6e35895 Mon Sep 17 00:00:00 2001 From: tanmayv25 Date: Fri, 11 Jul 2025 15:42:24 -0700 Subject: [PATCH 02/22] Add documentation and launch clean-up --- examples/tensorrt_llm/README.md | 177 +++++++++++ .../{configs => engine_configs}/agg.yaml | 0 .../{configs => engine_configs}/decode.yaml | 0 .../engine_configs/deepseek_r1/README.md | 19 ++ .../engine_configs/deepseek_r1/agg.yaml | 54 ++++ .../engine_configs/deepseek_r1/decode.yaml | 55 ++++ .../engine_configs/deepseek_r1/mtp/agg.yaml | 50 +++ .../deepseek_r1/mtp/decode.yaml | 0 .../deepseek_r1/mtp/prefill.yaml | 0 .../engine_configs/deepseek_r1/prefill.yaml | 37 +++ .../deepseek_r1/wide_ep/dep16_agg.yaml | 27 ++ .../deepseek_r1/wide_ep/eplb.yaml | 7 + .../deepseek_r1/wide_ep/wide_ep_agg.yaml | 35 +++ .../deepseek_r1/wide_ep/wide_ep_decode.yaml | 59 ++++ .../deepseek_r1/wide_ep/wide_ep_prefill.yaml | 0 .../{configs => engine_configs}/prefill.yaml | 0 examples/tensorrt_llm/kv-cache-tranfer.md | 70 +++++ examples/tensorrt_llm/launch/agg.sh | 11 +- examples/tensorrt_llm/launch/agg_router.sh | 13 +- examples/tensorrt_llm/launch/disagg.sh | 25 +- .../launch/disagg_prefill_first_strategy.sh | 36 --- examples/tensorrt_llm/launch/disagg_router.sh | 42 ++- .../multinode/multinode-examples.md | 285 ++++++++++++++++++ .../multinode/slurm_launch/srun_aggregated.sh | 74 +++++ .../slurm_launch/srun_disaggregated.sh | 96 ++++++ .../slurm_launch/start_frontend_services.sh | 16 + .../slurm_launch/start_trtllm_worker.sh | 39 +++ examples/tensorrt_llm/utils/get_env_vars.sh | 46 +++ .../utils/request_handlers/handler_base.py | 4 +- .../utils/request_handlers/handlers.py | 9 +- 30 files changed, 1220 insertions(+), 66 deletions(-) create mode 100644 examples/tensorrt_llm/README.md rename examples/tensorrt_llm/{configs => engine_configs}/agg.yaml (100%) rename examples/tensorrt_llm/{configs => engine_configs}/decode.yaml (100%) create mode 100644 examples/tensorrt_llm/engine_configs/deepseek_r1/README.md create mode 100644 examples/tensorrt_llm/engine_configs/deepseek_r1/agg.yaml create mode 100644 examples/tensorrt_llm/engine_configs/deepseek_r1/decode.yaml create mode 100644 examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/agg.yaml create mode 100644 
examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/decode.yaml create mode 100644 examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/prefill.yaml create mode 100644 examples/tensorrt_llm/engine_configs/deepseek_r1/prefill.yaml create mode 100644 examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/dep16_agg.yaml create mode 100644 examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/eplb.yaml create mode 100644 examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml create mode 100644 examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml create mode 100644 examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml rename examples/tensorrt_llm/{configs => engine_configs}/prefill.yaml (100%) create mode 100644 examples/tensorrt_llm/kv-cache-tranfer.md delete mode 100755 examples/tensorrt_llm/launch/disagg_prefill_first_strategy.sh create mode 100644 examples/tensorrt_llm/multinode/multinode-examples.md create mode 100644 examples/tensorrt_llm/multinode/slurm_launch/srun_aggregated.sh create mode 100644 examples/tensorrt_llm/multinode/slurm_launch/srun_disaggregated.sh create mode 100644 examples/tensorrt_llm/multinode/slurm_launch/start_frontend_services.sh create mode 100644 examples/tensorrt_llm/multinode/slurm_launch/start_trtllm_worker.sh create mode 100644 examples/tensorrt_llm/utils/get_env_vars.sh diff --git a/examples/tensorrt_llm/README.md b/examples/tensorrt_llm/README.md new file mode 100644 index 0000000000..80b3c02035 --- /dev/null +++ b/examples/tensorrt_llm/README.md @@ -0,0 +1,177 @@ + + +# LLM Deployment Examples using TensorRT-LLM + +This directory contains examples and reference implementations for deploying Large Language Models (LLMs) in various configurations using TensorRT-LLM. + +# User Documentation + +- [Deployment Architectures](#deployment-architectures) +- [Getting Started](#getting-started) + - [Prerequisites](#prerequisites) + - [Build docker](#build-docker) + - [Run container](#run-container) + - [Run deployment](#run-deployment) + - [Single Node deployment](#single-node-example-architectures) + - [Multinode deployment](#multinode-deployment) + - [Client](#client) + - [Benchmarking](#benchmarking) + - [Close Deployment](#close-deployment) +- [KV Cache Transfer](#kv-cache-transfer-in-disaggregated-serving) + +# Quick Start + +## Use the Latest Release + +We recommend using the latest stable release of dynamo to avoid breaking changes: + +[![GitHub Release](https://img.shields.io/github/v/release/ai-dynamo/dynamo)](https://github.com/ai-dynamo/dynamo/releases/latest) + +You can find the latest release [here](https://github.com/ai-dynamo/dynamo/releases/latest) and check out the corresponding branch with: + +```bash +git checkout $(git describe --tags $(git rev-list --tags --max-count=1)) +``` + +## Deployment Architectures + +See [deployment architectures](../llm/README.md#deployment-architectures) to learn about the general idea of the architecture. + +Note: TensorRT-LLM disaggregation does not support conditional disaggregation yet. You can configure the deployment to always use either aggregate or disaggregated serving. + +## Getting Started + +1. Choose a deployment architecture based on your requirements +2. Configure the components as needed +3. 
Deploy using the provided scripts + +### Prerequisites + +Start required services (etcd and NATS) using [Docker Compose](../../deploy/metrics/docker-compose.yml) +```bash +docker compose -f deploy/metrics/docker-compose.yml up -d +``` + +### Build docker + +```bash +# TensorRT-LLM uses git-lfs, which needs to be installed in advance. +apt-get update && apt-get -y install git git-lfs + +# On an x86 machine: +./container/build.sh --framework tensorrtllm + +# On an ARM machine: +./container/build.sh --framework tensorrtllm --platform linux/arm64 + +# Build the container with the default experimental TensorRT-LLM commit +# WARNING: This is for experimental feature testing only. +# The container should not be used in a production environment. +./container/build.sh --framework tensorrtllm --use-default-experimental-tensorrtllm-commit +``` + +### Run container + +``` +./container/run.sh --framework tensorrtllm -it +``` +## Run Deployment + +This figure shows an overview of the major components to deploy: + + + +``` + ++------+ +-----------+ +------------------+ +---------------+ +| HTTP |----->| processor |----->| Worker |------------>| Prefill | +| |<-----| |<-----| |<------------| Worker | ++------+ +-----------+ +------------------+ +---------------+ + | ^ | + query best | | return | publish kv events + worker | | worker_id v + | | +------------------+ + | +---------| kv-router | + +------------->| | + +------------------+ + +``` + +Note: The above architecture illustrates all the components. The final components +that get spawned depend upon the chosen graph. + +### Single-Node example architectures + +> [!IMPORTANT] +> Below we provide some simple shell scripts that run the components for each configuration. Each shell script is simply running the `dynamo-run` to start up the ingress and using `python3` to start up the workers. You can easily take each commmand and run them in separate terminals. + +#### Aggregated +```bash +cd $DYNAMO_ROOT/examples/tensorrt_llm +./launch/agg.sh +``` + +#### Aggregated with KV Routing +```bash +cd $DYNAMO_ROOT/examples/tensorrt_llm +./launch/agg_router.sh +``` + +#### Disaggregated + +> [!IMPORTANT] +> Disaggregated serving supports two strategies for request flow: `"prefill_first"` and `"decode_first"`. By default, the script below uses the `"decode_first"` strategy, which can reduce response latency by minimizing extra hops in the return path. You can switch strategies by setting the `DISAGGREGATION_STRATEGY` environment variable. + +```bash +cd $DYNAMO_ROOT/examples/tensorrt_llm +./launch/disagg.sh +``` + +#### Disaggregated with KV Routing + +> [!IMPORTANT] +> Disaggregated serving with KV routing uses a "prefill first" workflow by default. Currently, Dynamo supports KV routing to only one endpoint per model. In disaggregated workflow, it is generally more effective to route requests to the prefill worker. If you wish to use a "decode first" workflow instead, you can simply set the `DISAGGREGATION_STRATEGY` environment variable accordingly. + +```bash +cd $DYNAMO_ROOT/examples/tensorrt_llm +./launch/disagg_router.sh +``` + +### Multinode Deployment + +For details and instructions on multinode serving, please refer to the [multinode-examples.md](./multinode-examples.md) document. This guide provides step-by-step examples and configuration tips for deploying Dynamo with TensorRT-LLM across multiple nodes. + + +### Client + +See [client](../llm/README.md#client) section to learn how to send request to the deployment. 
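+
+For illustration, a minimal OpenAI-compatible request against the HTTP frontend started by the scripts above might look like the sketch below. It assumes the defaults used in the single-node launch scripts (`--http-port=8000` and the `deepseek-ai/DeepSeek-R1-Distill-Llama-8B` served model name); adjust both to match your deployment.
+
+```bash
+curl -s http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+    "messages": [{"role": "user", "content": "What is disaggregated serving?"}],
+    "max_tokens": 64,
+    "stream": false
+  }'
+```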
+ +NOTE: To send a request to a multi-node deployment, target the node which is running `dynamo-run in=http`. + +### Benchmarking + +To benchmark your deployment with GenAI-Perf, see this utility script, configuring the +`model` name and `host` based on your deployment: [perf.sh](../../benchmarks/llm/perf.sh) + +### Close deployment + +See [close deployment](../../docs/guides/dynamo_serve.md#close-deployment) section to learn about how to close the deployment. + +### KV Cache Transfer in Disaggregated Serving + +Dynamo with TensorRT-LLM supports two methods for transferring KV cache in disaggregated serving: UCX (default) and NIXL (experimental). For detailed information and configuration instructions for each method, see the [KV cache transfer guide](./kv-cache-tranfer.md). diff --git a/examples/tensorrt_llm/configs/agg.yaml b/examples/tensorrt_llm/engine_configs/agg.yaml similarity index 100% rename from examples/tensorrt_llm/configs/agg.yaml rename to examples/tensorrt_llm/engine_configs/agg.yaml diff --git a/examples/tensorrt_llm/configs/decode.yaml b/examples/tensorrt_llm/engine_configs/decode.yaml similarity index 100% rename from examples/tensorrt_llm/configs/decode.yaml rename to examples/tensorrt_llm/engine_configs/decode.yaml diff --git a/examples/tensorrt_llm/engine_configs/deepseek_r1/README.md b/examples/tensorrt_llm/engine_configs/deepseek_r1/README.md new file mode 100644 index 0000000000..5c7a87782c --- /dev/null +++ b/examples/tensorrt_llm/engine_configs/deepseek_r1/README.md @@ -0,0 +1,19 @@ + + + +This folder contains launch scripts for \ No newline at end of file diff --git a/examples/tensorrt_llm/engine_configs/deepseek_r1/agg.yaml b/examples/tensorrt_llm/engine_configs/deepseek_r1/agg.yaml new file mode 100644 index 0000000000..a7f9ce2ced --- /dev/null +++ b/examples/tensorrt_llm/engine_configs/deepseek_r1/agg.yaml @@ -0,0 +1,54 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +backend: pytorch + +# TP/EP/PP/DP +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +pipeline_parallel_size: 1 +enable_attention_dp: false + +max_batch_size: 256 +# 8448 = 8192 ISL + 256 OSL +max_num_tokens: 8448 +max_seq_len: 8448 + +kv_cache_config: + # With dp attention disabled: high free_gpu_memory_fraction is fine. + free_gpu_memory_fraction: 0.85 + # With dp attention enabled: large ISL at high concurrency may need + # free_gpu_memory_fraction low to have enough available memory. 
+ # free_gpu_memory_fraction: 0.30 + +# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603 +# NOTE: overlap_scheduler enabled by default since this commit and changed +# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler': +# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 +use_cuda_graph: true +cuda_graph_padding_enabled: true +# NOTE: For larger max batch size, you may want to add larger cuda graph +# batch sizes below to match. +cuda_graph_batch_sizes: +- 1 +- 2 +- 4 +- 8 +- 16 +- 32 +- 64 +- 128 +- 256 +print_iter_log: true +kv_cache_dtype: fp8 \ No newline at end of file diff --git a/examples/tensorrt_llm/engine_configs/deepseek_r1/decode.yaml b/examples/tensorrt_llm/engine_configs/deepseek_r1/decode.yaml new file mode 100644 index 0000000000..51a5bb2471 --- /dev/null +++ b/examples/tensorrt_llm/engine_configs/deepseek_r1/decode.yaml @@ -0,0 +1,55 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +backend: pytorch + +# TP/EP/PP/DP +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +pipeline_parallel_size: 1 +enable_attention_dp: false + +max_batch_size: 256 +max_num_tokens: 256 +# 8448 = 8192 ISL + 256 OSL +max_seq_len: 8448 + +kv_cache_config: + # With dp attention disabled: high free_gpu_memory_fraction is fine. + free_gpu_memory_fraction: 0.85 + # With dp attention enabled: large ISL at high concurrency may need + # free_gpu_memory_fraction low to have enough available memory. + # free_gpu_memory_fraction: 0.30 + +# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603 +# NOTE: overlap_scheduler enabled by default since this commit and changed +# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler': +# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 +disable_overlap_scheduler: false +use_cuda_graph: true +cuda_graph_padding_enabled: true +# NOTE: For larger max batch size, you may want to add larger cuda graph +# batch sizes below to match. +cuda_graph_batch_sizes: +- 1 +- 2 +- 4 +- 8 +- 16 +- 32 +- 64 +- 128 +- 256 +print_iter_log: true +kv_cache_dtype: fp8 \ No newline at end of file diff --git a/examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/agg.yaml b/examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/agg.yaml new file mode 100644 index 0000000000..20944defed --- /dev/null +++ b/examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/agg.yaml @@ -0,0 +1,50 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: FP4 only supported starting with Blackwell GPUs. +# https://huggingface.co/nvidia/DeepSeek-R1-FP4 +# You can also specify the full path to locally downloaded weights +# instead of a HuggingFace ID here. + +backend: pytorch +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +enable_attention_dp: true +max_batch_size: 256 +# 8448 = 8192 ISL + 256 OSL +max_num_tokens: 8448 +max_seq_len: 8448 +kv_cache_config: + free_gpu_memory_fraction: 0.30 + +# Enable the MTP(Multi-Token Prediction) in the model engine +speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +use_cuda_graph: true +cuda_graph_padding_enabled: true +cuda_graph_batch_sizes: +- 1 +- 2 +- 4 +- 8 +- 16 +- 32 +- 64 +- 128 +- 256 +print_iter_log: true +kv_cache_dtype: fp8 \ No newline at end of file diff --git a/examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/decode.yaml b/examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/decode.yaml new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/prefill.yaml b/examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/prefill.yaml new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/tensorrt_llm/engine_configs/deepseek_r1/prefill.yaml b/examples/tensorrt_llm/engine_configs/deepseek_r1/prefill.yaml new file mode 100644 index 0000000000..0221dcbedf --- /dev/null +++ b/examples/tensorrt_llm/engine_configs/deepseek_r1/prefill.yaml @@ -0,0 +1,37 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
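+
+# NOTE (illustrative): this prefill-only engine config keeps max_batch_size at 1
+# and disables the overlap scheduler below, since a context-only worker handles
+# one large prefill request at a time; max_num_tokens/max_seq_len are sized for
+# the 8192-token ISL used throughout these example configs.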
+backend: pytorch + +# TP/EP/PP/DP +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +pipeline_parallel_size: 1 +enable_attention_dp: true + +max_batch_size: 1 +max_num_tokens: 8192 +max_seq_len: 8192 + +kv_cache_config: + free_gpu_memory_fraction: 0.75 + +# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603 +# NOTE: overlap_scheduler enabled by default since this commit and changed +# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler': +# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 +disable_overlap_scheduler: true +print_iter_log: true +# NOTE: This dtype must match in both prefill/decode configs +kv_cache_dtype: fp8 \ No newline at end of file diff --git a/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/dep16_agg.yaml b/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/dep16_agg.yaml new file mode 100644 index 0000000000..f02d3388b0 --- /dev/null +++ b/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/dep16_agg.yaml @@ -0,0 +1,27 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Example of a Multi-node worker, but no WideEP or EPLB. +# See wide_ep*.yaml for WideEP example configs. +backend: pytorch +tensor_parallel_size: 16 +moe_expert_parallel_size: 16 +enable_attention_dp: true +max_batch_size: 256 +max_num_tokens: 256 +max_seq_len: 8448 +kv_cache_config: + free_gpu_memory_fraction: 0.7 +use_cuda_graph: true +cuda_graph_padding_enabled: true +cuda_graph_batch_sizes: +- 1 +- 2 +- 4 +- 8 +- 16 +- 32 +- 64 +- 128 +- 256 +kv_cache_dtype: fp8 \ No newline at end of file diff --git a/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/eplb.yaml b/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/eplb.yaml new file mode 100644 index 0000000000..60e85e5fa9 --- /dev/null +++ b/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/eplb.yaml @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# moe_load_balancer settings for TRTLLM based on: +# https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/ep_load_balancer/README.md#online-ep-load-balancer +num_slots: 288 +layer_updates_per_iter: 2 \ No newline at end of file diff --git a/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml b/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml new file mode 100644 index 0000000000..84e35930de --- /dev/null +++ b/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml @@ -0,0 +1,35 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +backend: pytorch + +# WideEP related settings +moe_backend: WideEP +# moe_max_num_tokens will default to max_num_tokens if left unspecified. 
+# +# If you want to set this value explicitly, one recommendation is below: +# moe_max_num_tokens = max_batch_size * moe_expert_parallel_size +# 4096 = 256 * 16 +# moe_max_num_tokens: 4096 +moe_load_balancer: /mnt/engine_configs/eplb.yaml +tensor_parallel_size: 16 +moe_expert_parallel_size: 16 + +enable_attention_dp: true +max_batch_size: 256 +max_num_tokens: 256 +max_seq_len: 8448 +kv_cache_config: + free_gpu_memory_fraction: 0.7 +use_cuda_graph: true +cuda_graph_padding_enabled: true +cuda_graph_batch_sizes: +- 1 +- 2 +- 4 +- 8 +- 16 +- 32 +- 64 +- 128 +- 256 +kv_cache_dtype: fp8 \ No newline at end of file diff --git a/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml b/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml new file mode 100644 index 0000000000..dc43788f94 --- /dev/null +++ b/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml @@ -0,0 +1,59 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +backend: pytorch + +# WideEP related settings +moe_backend: WideEP +moe_load_balancer: /mnt/engine_configs/eplb.yaml + +# TP/EP/PP/DP +tensor_parallel_size: 16 +moe_expert_parallel_size: 16 +pipeline_parallel_size: 1 +enable_attention_dp: true + +max_batch_size: 256 +max_num_tokens: 256 +# 8448 = 8192 ISL + 256 OSL +max_seq_len: 8448 + +kv_cache_config: + # With dp attention disabled: high free_gpu_memory_fraction is fine. + # free_gpu_memory_fraction: 0.85 + # With dp attention enabled: large ISL at high concurrency may need + # free_gpu_memory_fraction low to have enough available memory. + free_gpu_memory_fraction: 0.30 + +# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603 +# NOTE: overlap_scheduler enabled by default since this commit and changed +# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler': +# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 +disable_overlap_scheduler: false +use_cuda_graph: true +cuda_graph_padding_enabled: true +# NOTE: For larger max batch size, you may want to add larger cuda graph +# batch sizes below to match. 
+cuda_graph_batch_sizes: +- 1 +- 2 +- 4 +- 8 +- 16 +- 32 +- 64 +- 128 +- 256 +print_iter_log: true +kv_cache_dtype: fp8 \ No newline at end of file diff --git a/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml b/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/tensorrt_llm/configs/prefill.yaml b/examples/tensorrt_llm/engine_configs/prefill.yaml similarity index 100% rename from examples/tensorrt_llm/configs/prefill.yaml rename to examples/tensorrt_llm/engine_configs/prefill.yaml diff --git a/examples/tensorrt_llm/kv-cache-tranfer.md b/examples/tensorrt_llm/kv-cache-tranfer.md new file mode 100644 index 0000000000..14247f71fe --- /dev/null +++ b/examples/tensorrt_llm/kv-cache-tranfer.md @@ -0,0 +1,70 @@ + + + + +# KV Cache Transfer in Disaggregated Serving + +In disaggregated serving architectures, KV cache must be transferred between prefill and decode workers. TensorRT-LLM supports two methods for this transfer: + +## Default Method: UCX +By default, TensorRT-LLM uses UCX (Unified Communication X) for KV cache transfer between prefill and decode workers. UCX provides high-performance communication optimized for GPU-to-GPU transfers. + +## Experimental Method: NIXL +TensorRT-LLM also provides experimental support for using **NIXL** (NVIDIA Inference Xfer Library) for KV cache transfer. [NIXL](https://github.com/ai-dynamo/nixl) is NVIDIA's high-performance communication library designed for efficient data transfer in distributed GPU environments. + +**Note:** NIXL support in TensorRT-LLM is experimental and is not suitable for production environments yet. + +## Using NIXL for KV Cache Transfer + +**Note:** NIXL backend for TensorRT-LLM is currently only supported on AMD64 (x86_64) architecture. If you're running on ARM64, you'll need to use the default UCX method for KV cache transfer. + +To enable NIXL for KV cache transfer in disaggregated serving: + +1. **Build the container with NIXL support:** + The TensorRT-LLM wheel must be built from source with NIXL support. The `./container/build.sh` script caches previously built TensorRT-LLM wheels to reduce build time. If you have previously built a TensorRT-LLM wheel without NIXL support, you must delete the cached wheel to force a rebuild with NIXL support. + + **Remove cached TensorRT-LLM wheel (only if previously built without NIXL support):** + ```bash + rm -rf /tmp/trtllm_wheel + ``` + + **Build the container with NIXL support:** + ```bash + ./container/build.sh --framework tensorrtllm \ + --use-default-experimental-tensorrtllm-commit \ + --trtllm-use-nixl-kvcache-experimental + ``` + + **Note:** Both `--use-default-experimental-tensorrtllm-commit` and `--trtllm-use-nixl-kvcache-experimental` flags are required to enable NIXL support. + +2. **Run the containerized environment:** + See [run container](./README.md#run-container) section to learn how to start the container image built in previous step. + +3. **Start the disaggregated service:** + See [disaggregated serving](./README.md#disaggregated-serving) to see how to start the deployment. + +4. **Send the request:** + See [client](./README.md#client) section to learn how to send the request to deployment. + +**Important:** Ensure that ETCD and NATS services are running before starting the service. + +The container will automatically configure the appropriate environment variables (`TRTLLM_USE_NIXL_KVCACHE=1`) when built with the NIXL flag. 
The same container image can be used to use UCX for KV cache transfer. +```bash +unset TRTLLM_USE_NIXL_KVCACHE +export TRTLLM_USE_UCX_KVCACHE=1 +``` \ No newline at end of file diff --git a/examples/tensorrt_llm/launch/agg.sh b/examples/tensorrt_llm/launch/agg.sh index 29fded4664..1744e08c5b 100755 --- a/examples/tensorrt_llm/launch/agg.sh +++ b/examples/tensorrt_llm/launch/agg.sh @@ -2,6 +2,11 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +# Environment variables with defaults +export MODEL_PATH=${MODEL_PATH:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"} +export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"} +export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"engine_configs/agg.yaml"} + # Setup cleanup trap cleanup() { echo "Cleaning up background processes..." @@ -20,6 +25,6 @@ DYNAMO_PID=$! # run worker python3 components/worker.py \ - --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ - --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ - --extra-engine-args configs/agg.yaml + --model-path "$MODEL_PATH" \ + --served-model-name "$SERVED_MODEL_NAME" \ + --extra-engine-args "$AGG_ENGINE_ARGS" diff --git a/examples/tensorrt_llm/launch/agg_router.sh b/examples/tensorrt_llm/launch/agg_router.sh index 9aaa7e1711..09a5ff988b 100755 --- a/examples/tensorrt_llm/launch/agg_router.sh +++ b/examples/tensorrt_llm/launch/agg_router.sh @@ -2,6 +2,11 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +# Environment variables with defaults +export MODEL_PATH=${MODEL_PATH:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"} +export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"} +export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"engine_configs/agg.yaml"} + # Setup cleanup trap cleanup() { echo "Cleaning up background processes..." @@ -20,7 +25,7 @@ DYNAMO_PID=$! # run worker python3 components/worker.py \ - --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ - --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ - --extra-engine-args configs/agg.yaml \ - --publish-events-and-metrics \ No newline at end of file + --model-path "$MODEL_PATH" \ + --served-model-name "$SERVED_MODEL_NAME" \ + --extra-engine-args "$AGG_ENGINE_ARGS" \ + --publish-events-and-metrics diff --git a/examples/tensorrt_llm/launch/disagg.sh b/examples/tensorrt_llm/launch/disagg.sh index da40d2971c..d4952838cb 100755 --- a/examples/tensorrt_llm/launch/disagg.sh +++ b/examples/tensorrt_llm/launch/disagg.sh @@ -2,6 +2,15 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +# Environment variables with defaults +export MODEL_PATH=${MODEL_PATH:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"} +export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"} +export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"} +export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"configs/prefill.yaml"} +export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"configs/decode.yaml"} +export PREFILL_CUDA_VISIBLE_DEVICES=${PREFILL_CUDA_VISIBLE_DEVICES:-"0"} +export DECODE_CUDA_VISIBLE_DEVICES=${DECODE_CUDA_VISIBLE_DEVICES:-"1"} + # Setup cleanup trap cleanup() { echo "Cleaning up background processes..." @@ -19,16 +28,16 @@ dynamo run in=http out=dyn --http-port=8000 & DYNAMO_PID=$! 
# run prefill worker -python3 components/worker.py \ - --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ - --served-model-name TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ - --extra-engine-args configs/prefill.yaml \ +CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 components/worker.py \ + --model-path "$MODEL_PATH" \ + --served-model-name "$SERVED_MODEL_NAME" \ + --extra-engine-args "$PREFILL_ENGINE_ARGS" \ --disaggregation-mode prefill & PREFILL_PID=$! # run decode worker -python3 components/worker.py \ - --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ - --served-model-name TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ - --extra-engine-args configs/decode.yaml \ +CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 components/worker.py \ + --model-path "$MODEL_PATH" \ + --served-model-name "$SERVED_MODEL_NAME" \ + --extra-engine-args "$DECODE_ENGINE_ARGS" \ --disaggregation-mode decode \ No newline at end of file diff --git a/examples/tensorrt_llm/launch/disagg_prefill_first_strategy.sh b/examples/tensorrt_llm/launch/disagg_prefill_first_strategy.sh deleted file mode 100755 index 2aa4130366..0000000000 --- a/examples/tensorrt_llm/launch/disagg_prefill_first_strategy.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -# Setup cleanup trap -cleanup() { - echo "Cleaning up background processes..." - kill $DYNAMO_PID $PREFILL_PID 2>/dev/null || true - wait $DYNAMO_PID $PREFILL_PID 2>/dev/null || true - echo "Cleanup complete." -} -trap cleanup EXIT INT TERM - -# run clear_namespace -python3 utils/clear_namespace.py --namespace dynamo - -# run ingress -dynamo run in=http out=dyn --http-port=8000 & -DYNAMO_PID=$! - -# run prefill worker -python3 components/worker.py \ - --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ - --served-model-name TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ - --extra-engine-args configs/prefill.yaml \ - --disaggregation-mode prefill \ - --disaggregation-strategy prefill_first& -PREFILL_PID=$! - -# run decode worker -python3 components/worker.py \ - --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ - --served-model-name TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ - --extra-engine-args configs/decode.yaml \ - --disaggregation-mode decode \ - --disaggregation-strategy prefill_first \ No newline at end of file diff --git a/examples/tensorrt_llm/launch/disagg_router.sh b/examples/tensorrt_llm/launch/disagg_router.sh index 0fc81d7c8d..a373a8f6bd 100755 --- a/examples/tensorrt_llm/launch/disagg_router.sh +++ b/examples/tensorrt_llm/launch/disagg_router.sh @@ -2,6 +2,15 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +# Environment variables with defaults +export MODEL_PATH=${MODEL_PATH:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"} +export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"} +export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"prefill_first"} +export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"configs/prefill.yaml"} +export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"configs/decode.yaml"} +export PREFILL_CUDA_VISIBLE_DEVICES=${PREFILL_CUDA_VISIBLE_DEVICES:-"0"} +export DECODE_CUDA_VISIBLE_DEVICES=${DECODE_CUDA_VISIBLE_DEVICES:-"1"} + # Setup cleanup trap cleanup() { echo "Cleaning up background processes..." 
@@ -18,20 +27,31 @@ python3 utils/clear_namespace.py --namespace dynamo dynamo run in=http out=dyn --router-mode kv --http-port=8000 & DYNAMO_PID=$! + +EXTRA_PREFILL_ARGS="" +EXTRA_DECODE_ARGS="" +if [ "$DISAGGREGATION_STRATEGY" == "prefill_first" ]; then + EXTRA_PREFILL_ARGS="--publish-events-and-metrics" +else + EXTRA_DECODE_ARGS="--publish-events-and-metrics" +fi + + # run prefill worker -python3 components/worker.py \ - --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ - --served-model-name TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ - --extra-engine-args configs/prefill.yaml \ +CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 components/worker.py \ + --model-path "$MODEL_PATH" \ + --served-model-name "$SERVED_MODEL_NAME" \ + --extra-engine-args "$PREFILL_ENGINE_ARGS" \ --disaggregation-mode prefill \ - --disaggregation-strategy prefill_first \ - --publish-events-and-metrics & + --disaggregation-strategy "$DISAGGREGATION_STRATEGY" \ + $EXTRA_PREFILL_ARGS & PREFILL_PID=$! # run decode worker -python3 components/worker.py \ - --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ - --served-model-name TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ - --extra-engine-args configs/decode.yaml \ +CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 components/worker.py \ + --model-path "$MODEL_PATH" \ + --served-model-name "$SERVED_MODEL_NAME" \ + --extra-engine-args "$DECODE_ENGINE_ARGS" \ --disaggregation-mode decode \ - --disaggregation-strategy prefill_first \ No newline at end of file + --disaggregation-strategy "$DISAGGREGATION_STRATEGY" \ + $EXTRA_DECODE_ARGS \ No newline at end of file diff --git a/examples/tensorrt_llm/multinode/multinode-examples.md b/examples/tensorrt_llm/multinode/multinode-examples.md new file mode 100644 index 0000000000..bf48bf623a --- /dev/null +++ b/examples/tensorrt_llm/multinode/multinode-examples.md @@ -0,0 +1,285 @@ + + +# Example: Multi-node TRTLLM Workers with Dynamo on Slurm + +To run a single Dynamo+TRTLLM Worker that spans multiple nodes (ex: TP16), +the set of nodes need to be launched together in the same MPI world, such as +via `mpirun` or `srun`. This is true regardless of whether the worker is +aggregated, prefill-only, or decode-only. + +In this document we will demonstrate two examples launching multinode workers +on a slurm cluster with `srun`: +1. Deploying an aggregated nvidia/DeepSeek-R1 model as a multi-node TP16/EP16 + worker across 4 GB200 nodes +2. Deploying a disaggregated nvidia/DeepSeek-R1 model with a multi-node + TP16/EP16 prefill worker (4 nodes) and a multi-node TP16/EP16 decode + worker (4 nodes) across a total of 8 GB200 nodes. + +NOTE: Some of the scripts used in this example like `start_frontend_services.sh` and +`start_trtllm_worker.sh` should be translatable to other environments like Kubernetes, or +using `mpirun` directly, with relative ease. + +## Setup + +For simplicity of the example, we will make some assumptions about your slurm cluster: +1. First, we assume you have access to a slurm cluster with multiple GPU nodes + available. For functional testing, most setups should be fine. For performance + testing, you should aim to allocate groups of nodes that are performantly + inter-connected, such as those in an NVL72 setup. +2. Second, we assume this slurm cluster has the [Pyxis](https://github.com/NVIDIA/pyxis) + SPANK plugin setup. 
In particular, the `srun_aggregated.sh` script in this + example will use `srun` arguments like `--container-image`, + `--container-mounts`, and `--container-env` that are added to `srun` by Pyxis. + If your cluster supports similar container-based plugins, you may be able to + modify the script to use that instead. +3. Third, we assume you have already built a recent Dynamo+TRTLLM container image as + described [here](https://github.com/ai-dynamo/dynamo/tree/main/examples/tensorrt_llm#build-docker). + This is the image that can be set to the `IMAGE` environment variable in later steps. +4. Fourth, we assume you pre-allocate a group of nodes using `salloc`. We + will allocate 8 nodes below as a reference command to have enough capacity + to run both examples. If you plan to only run the aggregated example, you + will only need 4 nodes. If you customize the configurations to require a + different number of nodes, you can adjust the number of allocated nodes + accordingly. Pre-allocating nodes is technically not a requirement, + but it makes iterating on tests and experiments easier. + + Make sure to set your `PARTITION` and `ACCOUNT` according to your slurm cluster setup: + ```bash + # Set partition manually based on your slurm cluster's partition names + PARTITION="" + # Set account manually if this command doesn't work on your cluster + ACCOUNT="$(sacctmgr -nP show assoc where user=$(whoami) format=account)" + salloc \ + --partition="${PARTITION}" \ + --account="${ACCOUNT}" \ + --job-name="${ACCOUNT}-dynamo.trtllm" \ + -t 05:00:00 \ + --nodes 8 + ``` +5. Lastly, we will assume you are inside an interactive shell on one of your allocated + nodes, which may be the default behavior after executing the `salloc` command above + depending on the cluster setup. If not, then you should SSH into one of the allocated nodes. + +### Environment Variable Setup + +This example aims to automate as much of the environment setup as possible, +but all slurm clusters and environments are different, and you may need to +dive into the scripts to make modifications based on your specific environment. + +Assuming you have already allocated your nodes via `salloc`, and are +inside an interactive shell on one of the allocated nodes, set the +following environment variables based on your environment: +```bash +# NOTE: IMAGE must be set manually for now +# To build an image, see the steps here: +# https://github.com/ai-dynamo/dynamo/tree/main/examples/tensorrt_llm#build-docker +export IMAGE="" + +# MOUNTS are the host:container path pairs that are mounted into the containers +# launched by each `srun` command. +# +# If you want to reference files, such as $MODEL_PATH below, in a +# different location, you can customize MOUNTS or specify additional +# comma-separated mount pairs here. +# +# NOTE: Currently, this example assumes that the local bash scripts and configs +# referenced are mounted into /mnt inside the container. If you want to +# customize the location of the scripts, make sure to modify `srun_aggregated.sh` +# accordingly for the new locations of `start_frontend_services.sh` and +# `start_trtllm_worker.sh`. +# +# For example, assuming your cluster had a `/lustre` directory on the host, you +# could add that as a mount like so: +# +# export MOUNTS="${PWD}:/mnt,/lustre:/lustre" +export MOUNTS="${PWD}:/mnt" + +# NOTE: In general, Deepseek R1 is very large, so it is recommended to +# pre-download the model weights and save them in some shared location, +# NFS storage, HF_CACHE, etc.
and modify the `--model-path` below +# to reuse the pre-downloaded weights instead. +# +# On Blackwell systems (ex: GB200), it is recommended to use the FP4 weights: +# https://huggingface.co/nvidia/DeepSeek-R1-FP4 +# +# On Hopper systems, FP4 isn't supported so you'll need to use the default weights: +# https://huggingface.co/deepseek-ai/DeepSeek-R1 +export MODEL_PATH="nvidia/DeepSeek-R1-FP4" + +# The name the model will be served/queried under, matching what's +# returned by the /v1/models endpoint. +# +# By default this is inferred from MODEL_PATH, but when using locally downloaded +# model weights, it can be nice to have explicit control over the name. +export SERVED_MODEL_NAME="nvidia/DeepSeek-R1-FP4" +``` + +## Aggregated WideEP + +Assuming you have at least 4 nodes allocated following the setup steps above, +follow these steps below to launch an **aggregated** deployment across 4 nodes: + +```bash +# Default set in srun_aggregated.sh, but can customize here. +# export ENGINE_CONFIG="/mnt/engine_configs/wide_ep_agg.yaml" + +# Customize NUM_NODES to match the desired parallelism in ENGINE_CONFIG +# The product of NUM_NODES*NUM_GPUS_PER_NODE should match the number of +# total GPUs necessary to satisfy the requested parallelism. For example, +# 4 nodes x 4 gpus/node = 16 gpus total for TP16/EP16. +# export NUM_NODES=4 + +# GB200 nodes have 4 gpus per node, but for other types of nodes you can configure this. +# export NUM_GPUS_PER_NODE=4 + +# Launches: +# - frontend + etcd/nats on current (head) node +# - one large aggregated trtllm worker across multiple nodes via MPI tasks +./srun_aggregated.sh +``` + +## Disaggregated WideEP + +Assuming you have at least 8 nodes allocated (4 for prefill, 4 for decode) +following the setup above, follow these steps below to launch a **disaggregated** +deployment across 8 nodes: + +> [!Tip] +> Make sure you have a fresh environment and don't still have the aggregated +> example above still deployed on the same set of nodes. + +```bash +# Defaults set in srun_disaggregated.sh, but can customize here. +# export PREFILL_ENGINE_CONFIG="/mnt/engine_configs/wide_ep_prefill.yaml" +# export DECODE_ENGINE_CONFIG="/mnt/engine_configs/wide_ep_decode.yaml" + +# Customize NUM_PREFILL_NODES to match the desired parallelism in PREFILL_ENGINE_CONFIG +# Customize NUM_DECODE_NODES to match the desired parallelism in DECODE_ENGINE_CONFIG +# The products of NUM_PREFILL_NODES*NUM_GPUS_PER_NODE and +# NUM_DECODE_NODES*NUM_GPUS_PER_NODE should match the respective number of +# GPUs necessary to satisfy the requested parallelism in each config. +# export NUM_PREFILL_NODES=4 +# export NUM_DECODE_NODES=4 + +# GB200 nodes have 4 gpus per node, but for other types of nodes you can configure this. +# export NUM_GPUS_PER_NODE=4 + +# Launches: +# - frontend + etcd/nats on current (head) node. +# - one large prefill trtllm worker across multiple nodes via MPI tasks +# - one large decode trtllm worker across multiple nodes via MPI tasks +./srun_disaggregated.sh +``` + +## Understanding the Output + +1. The `srun_aggregated.sh` launches two `srun` jobs. The first launches + etcd, NATS, and the OpenAI frontend on the head node only + called "node1" in the example output below. The second launches + a single TP16 Dynamo+TRTLLM worker spread across 4 nodes, each node + using 4 GPUs each. + ``` + # Frontend/etcd/nats services + srun: launching StepId=453374.17 on host node1, 1 tasks: 0 + ... 
+ # TP16 TRTLLM worker split across 4 nodes with 4 gpus each + srun: launching StepId=453374.18 on host node1, 4 tasks: [0-3] + srun: launching StepId=453374.18 on host node2, 4 tasks: [4-7] + srun: launching StepId=453374.18 on host node3, 4 tasks: [8-11] + srun: launching StepId=453374.18 on host node4, 4 tasks: [12-15] + ``` +2. The OpenAI frontend will listen for and dynamically discover workers as + they register themselves with Dynamo's distributed runtime: + ``` + 0: 2025-06-13T02:36:48.160Z INFO dynamo_run::input::http: Watching for remote model at models + 0: 2025-06-13T02:36:48.161Z INFO dynamo_llm::http::service::service_v2: Starting HTTP service on: 0.0.0.0:8000 address="0.0.0.0:8000" + ``` +3. The TRTLLM worker will consist of N (N=16 for TP16) MPI ranks, 1 rank on each + GPU on each node, which will each output their progress while loading the model. + You can see each rank's output prefixed with the rank at the start of each log line + until the model successfully finishes loading: + ``` + 8: rank8 run mgmn worker node with mpi_world_size: 16 ... + 10: rank10 run mgmn worker node with mpi_world_size: 16 ... + 9: rank9 run mgmn worker node with mpi_world_size: 16 ... + 11: rank11 run mgmn worker node with mpi_world_size: 16 ... + ... + 15: Model init total -- 55.42s + 11: Model init total -- 55.91s + 12: Model init total -- 55.24s + ``` +4. After the model fully finishes loading on all ranks, the worker will register itself, + and the OpenAI frontend will detect it, signaled by this output: + ``` + 0: 2025-06-13T02:46:35.040Z INFO dynamo_llm::discovery::watcher: added model model_name="nvidia/DeepSeek-R1-FP4" + ``` +5. At this point, with the worker fully initialized and detected by the frontend, + it is now ready for inference. +6. For `srun_disaggregated.sh`, it follows a very similar flow, but launches + three srun jobs instead of two: one for the frontend, one for the prefill worker, + and one for the decode worker. + +## Example Request + +To verify the deployed model is working, send a `curl` request: +```bash +# NOTE: $HOST assumes running on head node, but can be changed to $HEAD_NODE_IP instead. +HOST=localhost +PORT=8000 +# "model" here should match the model name returned by the /v1/models endpoint +curl -w "%{http_code}" ${HOST}:${PORT}/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "'${SERVED_MODEL_NAME}'", + "messages": [ + { + "role": "user", + "content": "Tell me a story as if we were playing dungeons and dragons." + } + ], + "stream": true, + "max_tokens": 30 +}' +``` + +## Cleanup + +To clean up background `srun` processes launched by `srun_aggregated.sh` or +`srun_disaggregated.sh`, you can run: +```bash +pkill srun +``` + +## Known Issues + +- This example has only been tested on a 4xGB200 node setup with 16 GPUs using + FP4 weights. In theory, the example should work on alternative setups such as + H100 nodes with FP8 weights, but this hasn't been tested yet. +- WideEP configs in this directory are still being tested. A WideEP-specific + example with documentation will be added once ready. +- There are known issues where WideEP workers may not cleanly shut down: + - This may lead to leftover shared memory files in `/dev/shm/moe_*`. For + now, you must manually clean these up before deploying again on the + same set of nodes. + - Similarly, there may be GPU memory left in use after killing the `srun` + jobs. After cleaning up any leftover shared memory files as described + above, the GPU memory may slowly be released.
You can run `watch nvidia-smi` + to check on this behavior. If you don't free the GPU memory before the + next deployment, you may get a CUDA OOM error while loading the model. + - There is mention of this issue in the relevant TRT-LLM blog + [here](https://github.com/NVIDIA/TensorRT-LLM/blob/6021a439ab9c29f4c46f721eeb59f6b992c425ea/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md#miscellaneous). diff --git a/examples/tensorrt_llm/multinode/slurm_launch/srun_aggregated.sh b/examples/tensorrt_llm/multinode/slurm_launch/srun_aggregated.sh new file mode 100644 index 0000000000..a66c330cda --- /dev/null +++ b/examples/tensorrt_llm/multinode/slurm_launch/srun_aggregated.sh @@ -0,0 +1,74 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# This is one of the only variables that must be set currently, most of the rest may +# just work out of the box if following the steps in the README. +IMAGE="${IMAGE:-""}" + +# Set to mount current host directory to /mnt inside the container as an example, +# but you may freely customize the mounts based on your cluster. A common practice +# is to mount paths to NFS storage for common scripts, model weights, etc. +# NOTE: This can be a comma separated list of multiple mounts as well. +DEFAULT_MOUNT="${PWD}/../../:/mnt" +MOUNTS="${MOUNTS:-${DEFAULT_MOUNT}}" + +# Example values, assuming 4 nodes with 4 GPUs on each node, such as 4xGB200 nodes. +# For 8xH100 nodes as an example, you may set this to 2 nodes x 8 gpus/node instead. +NUM_NODES=${NUM_NODES:-4} +NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE:-4} + +export ENGINE_CONFIG="${ENGINE_CONFIG:-/mnt/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml}" + +# Automate settings of certain variables for convenience, but you are free +# to manually set these for more control as well. +ACCOUNT="$(sacctmgr -nP show assoc where user=$(whoami) format=account)" +export HEAD_NODE="${SLURMD_NODENAME}" +export HEAD_NODE_IP="$(hostname -i)" +export ETCD_ENDPOINTS="${HEAD_NODE_IP}:2379" +export NATS_SERVER="nats://${HEAD_NODE_IP}:4222" + +if [[ -z ${IMAGE} ]]; then + echo "ERROR: You need to set the IMAGE environment variable to the " \ + "Dynamo+TRTLLM docker image or .sqsh file from 'enroot import' " \ + "See how to build one from source here: " \ + "https://github.com/ai-dynamo/dynamo/tree/main/examples/tensorrt_llm#build-docker" + exit 1 +fi + +# NOTE: Output streamed to stdout for ease of understanding the example, but +# in practice you would probably set `srun --output ... --error ...` to pipe +# the stdout/stderr to files. +echo "Launching frontend services in background." +srun \ + --overlap \ + --container-image "${IMAGE}" \ + --container-mounts "${MOUNTS}" \ + --verbose \ + --label \ + -A "${ACCOUNT}" \ + -J "${ACCOUNT}-dynamo.trtllm" \ + --nodelist "${HEAD_NODE}" \ + --nodes 1 \ + --jobid "${SLURM_JOB_ID}" \ + /mnt/start_frontend_services.sh & + +# NOTE: Output streamed to stdout for ease of understanding the example, but +# in practice you would probably set `srun --output ... --error ...` to pipe +# the stdout/stderr to files. +echo "Launching multi-node worker in background." 
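+# For example (a sketch only; the log directory below is an assumption and must exist
+# in the container mounts), you could add the following to the srun command below to
+# capture per-task logs instead of streaming to stdout:
+#   --output /mnt/logs/agg_%x_%j_%t.out --error /mnt/logs/agg_%x_%j_%t.err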
+DISAGGREGATION_MODE="prefill_and_decode" \ +srun \ + --mpi pmix \ + --oversubscribe \ + --container-image "${IMAGE}" \ + --container-mounts "${MOUNTS}" \ + --container-env ETCD_ENDPOINTS,NATS_SERVER,HEAD_NODE_IP,HEAD_NODE,DISAGGREGATION_MODE,ENGINE_CONFIG \ + --verbose \ + --label \ + -A "${ACCOUNT}" \ + -J "${ACCOUNT}-dynamo.trtllm" \ + --nodes "${NUM_NODES}" \ + --ntasks-per-node "${NUM_GPUS_PER_NODE}" \ + --jobid "${SLURM_JOB_ID}" \ + /mnt/start_trtllm_worker.sh & \ No newline at end of file diff --git a/examples/tensorrt_llm/multinode/slurm_launch/srun_disaggregated.sh b/examples/tensorrt_llm/multinode/slurm_launch/srun_disaggregated.sh new file mode 100644 index 0000000000..155cb3254e --- /dev/null +++ b/examples/tensorrt_llm/multinode/slurm_launch/srun_disaggregated.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# This is one of the only variables that must be set currently, most of the rest may +# just work out of the box if following the steps in the README. +IMAGE="${IMAGE:-""}" + +# Set to mount current host directory to /mnt inside the container as an example, +# but you may freely customize the mounts based on your cluster. A common practice +# is to mount paths to NFS storage for common scripts, model weights, etc. +# NOTE: This can be a comma separated list of multiple mounts as well. +DEFAULT_MOUNT="${PWD}:/mnt" +MOUNTS="${MOUNTS:-${DEFAULT_MOUNT}}" + +NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE:-4} + +NUM_PREFILL_NODES=${NUM_PREFILL_NODES:-4} +PREFILL_ENGINE_CONFIG="${PREFILL_ENGINE_CONFIG:-/mnt/engine_configs/wide_ep_prefill.yaml}" + +NUM_DECODE_NODES=${NUM_DECODE_NODES:-4} +DECODE_ENGINE_CONFIG="${DECODE_ENGINE_CONFIG:-/mnt/engine_configs/wide_ep_decode.yaml}" + +DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"} + +# Automate settings of certain variables for convenience, but you are free +# to manually set these for more control as well. +ACCOUNT="$(sacctmgr -nP show assoc where user=$(whoami) format=account)" +export HEAD_NODE="${SLURMD_NODENAME}" +export HEAD_NODE_IP="$(hostname -i)" +export ETCD_ENDPOINTS="${HEAD_NODE_IP}:2379" +export NATS_SERVER="nats://${HEAD_NODE_IP}:4222" + +if [[ -z ${IMAGE} ]]; then + echo "ERROR: You need to set the IMAGE environment variable to the " \ + "Dynamo+TRTLLM docker image or .sqsh file from 'enroot import' " \ + "See how to build one from source here: " \ + "https://github.com/ai-dynamo/dynamo/tree/main/examples/tensorrt_llm#build-docker" + exit 1 +fi + +# NOTE: Output streamed to stdout for ease of understanding the example, but +# in practice you would probably set `srun --output ... --error ...` to pipe +# the stdout/stderr to files. +echo "Launching frontend services in background." +srun \ + --overlap \ + --container-image "${IMAGE}" \ + --container-mounts "${MOUNTS}" \ + --verbose \ + --label \ + -A "${ACCOUNT}" \ + -J "${ACCOUNT}-dynamo.trtllm" \ + --nodelist "${HEAD_NODE}" \ + --nodes 1 \ + --jobid "${SLURM_JOB_ID}" \ + /mnt/start_frontend_services.sh & + +# NOTE: Output streamed to stdout for ease of understanding the example, but +# in practice you would probably set `srun --output ... --error ...` to pipe +# the stdout/stderr to files. +echo "Launching multi-node prefill worker in background." 
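+# Sizing note (illustrative example): the prefill worker below spans
+# NUM_PREFILL_NODES x NUM_GPUS_PER_NODE MPI ranks, so the parallelism in
+# PREFILL_ENGINE_CONFIG must match, e.g. TP16/EP16 needs 4 nodes x 4 GPUs = 16 ranks.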
+DISAGGREGATION_MODE=prefill \ +ENGINE_CONFIG=${PREFILL_ENGINE_CONFIG} \ +srun \ + --mpi pmix \ + --oversubscribe \ + --container-image "${IMAGE}" \ + --container-mounts "${MOUNTS}" \ + --container-env ETCD_ENDPOINTS,NATS_SERVER,HEAD_NODE_IP,HEAD_NODE,DISAGGREGATION_MODE,DISAGGREGATION_STRATEGY,ENGINE_CONFIG \ + --verbose \ + --label \ + -A "${ACCOUNT}" \ + -J "${ACCOUNT}-dynamo.trtllm" \ + --nodes "${NUM_PREFILL_NODES}" \ + --ntasks-per-node "${NUM_GPUS_PER_NODE}" \ + --jobid "${SLURM_JOB_ID}" \ + /mnt/start_trtllm_worker.sh & + +echo "Launching multi-node decode worker in background." +DISAGGREGATION_MODE=decode \ +ENGINE_CONFIG=${DECODE_ENGINE_CONFIG} \ +srun \ + --mpi pmix \ + --oversubscribe \ + --container-image "${IMAGE}" \ + --container-mounts "${MOUNTS}" \ + --container-env ETCD_ENDPOINTS,NATS_SERVER,HEAD_NODE_IP,HEAD_NODE,DISAGGREGATION_MODE,DISAGGREGATION_STRATEGY,ENGINE_CONFIG \ + --verbose \ + --label \ + -A "${ACCOUNT}" \ + -J "${ACCOUNT}-dynamo.trtllm" \ + --nodes "${NUM_DECODE_NODES}" \ + --ntasks-per-node "${NUM_GPUS_PER_NODE}" \ + --jobid "${SLURM_JOB_ID}" \ + /mnt/start_trtllm_worker.sh & \ No newline at end of file diff --git a/examples/tensorrt_llm/multinode/slurm_launch/start_frontend_services.sh b/examples/tensorrt_llm/multinode/slurm_launch/start_frontend_services.sh new file mode 100644 index 0000000000..0d1b588904 --- /dev/null +++ b/examples/tensorrt_llm/multinode/slurm_launch/start_frontend_services.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Start NATS +nats-server -js & + +# Start etcd +etcd --listen-client-urls http://0.0.0.0:2379 --advertise-client-urls http://0.0.0.0:2379 --data-dir /tmp/etcd & + +# Wait for NATS/etcd to startup +sleep 3 + +# Start OpenAI Frontend which will dynamically discover workers when they startup +# NOTE: This is a blocking call. +dynamo-run in=http out=dyn --http-port 8000 diff --git a/examples/tensorrt_llm/multinode/slurm_launch/start_trtllm_worker.sh b/examples/tensorrt_llm/multinode/slurm_launch/start_trtllm_worker.sh new file mode 100644 index 0000000000..0806a513c0 --- /dev/null +++ b/examples/tensorrt_llm/multinode/slurm_launch/start_trtllm_worker.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +if [[ -z ${MODEL_PATH} ]]; then + echo "ERROR: MODEL_PATH was not set." + echo "ERROR: MODEL_PATH must be set to either the HuggingFace ID or locally " \ + "downloaded path to the model weights. Since Deepseek R1 is large, it is " \ + "recommended to pre-download them to a shared location and provide the path." + exit 1 +fi + +if [[ -z ${SERVED_MODEL_NAME} ]]; then + echo "WARNING: SERVED_MODEL_NAME was not set. It will be derived from MODEL_PATH." +fi + + + +if [[ -z ${ENGINE_CONFIG} ]]; then + echo "ERROR: ENGINE_CONFIG was not set." + echo "ERROR: ENGINE_CONFIG must be set to a valid Dynamo+TRTLLM engine config file." 
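+    # Example of a valid value (one of the configs shipped with this example, assuming
+    # the default /mnt mount used by the srun scripts):
+    #   ENGINE_CONFIG=/mnt/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml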
+ exit 1 +fi + +EXTRA_ARGS="" +if [[ -n ${DISAGGREGATION_MODE} ]]; then + EXTRA_ARGS+="--disaggregation-mode ${DISAGGREGATION_MODE}" +fi + +if [[ -n ${DISAGGREGATION_STRATEGY} ]]; then + EXTRA_ARGS+="--disaggregation-strategy ${DISAGGREGATION_STRATEGY}" +fi + +trtllm-llmapi-launch \ + python3 /workspace/examples/tensorrt_llm/components/worker.py \ + --model-path "${MODEL_PATH}" \ + --model-name "${SERVED_MODEL_NAME}" \ + --extra-engine-args "${ENGINE_CONFIG}" \ + ${EXTRA_ARGS} \ No newline at end of file diff --git a/examples/tensorrt_llm/utils/get_env_vars.sh b/examples/tensorrt_llm/utils/get_env_vars.sh new file mode 100644 index 0000000000..bd67e180a9 --- /dev/null +++ b/examples/tensorrt_llm/utils/get_env_vars.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Helper script to generate environment variables for each node during a multinode SGLang deployment + +echo "=== USAGE ===" +echo "1. Run this script: ./gen_env_vars.sh" +echo "2. Enter the IP addresses when prompted" +echo "3. Copy the commands for the head prefill node and run them" +echo "4. Copy the commands for all other nodes and run them on each node" +echo "5. Proceed with starting your prefill and decode workers" +echo "" + +# Prompt for IP addresses +read -p "Enter HEAD_PREFILL_NODE IP: " HEAD_PREFILL_NODE +read -p "Enter HEAD_DECODE_NODE IP: " HEAD_DECODE_NODE + +# Validate inputs +if [ -z "$HEAD_PREFILL_NODE" ] || [ -z "$HEAD_DECODE_NODE" ]; then + echo "Error: Both IP addresses are required" + exit 1 +fi + +echo "=== HEAD PREFILL NODE ($HEAD_PREFILL_NODE) ===" +echo "Run all of these commands on the head prefill node:" +echo "" +echo "nats-server -js &" +echo "etcd --listen-client-urls http://0.0.0.0:2379 \\" +echo " --advertise-client-urls http://0.0.0.0:2379 \\" +echo " --listen-peer-urls http://0.0.0.0:2380 \\" +echo " --initial-cluster default=http://$HEAD_PREFILL_NODE:2380 &" +echo "export HEAD_PREFILL_NODE_IP=$HEAD_PREFILL_NODE" +echo "export HEAD_DECODE_NODE_IP=$HEAD_DECODE_NODE" +echo "" +echo "=== ALL OTHER NODES ===" +echo "Run these commands on all other nodes (prefill and decode):" +echo "" +echo "# Export environment variables" +echo "export NATS_SERVER=nats://$HEAD_PREFILL_NODE:4222" +echo "export ETCD_ENDPOINTS=http://$HEAD_PREFILL_NODE:2379" +echo "export HEAD_PREFILL_NODE_IP=$HEAD_PREFILL_NODE" +echo "export HEAD_DECODE_NODE_IP=$HEAD_DECODE_NODE" + + + diff --git a/examples/tensorrt_llm/utils/request_handlers/handler_base.py b/examples/tensorrt_llm/utils/request_handlers/handler_base.py index 0dc3bb76b4..8f7fc35676 100644 --- a/examples/tensorrt_llm/utils/request_handlers/handler_base.py +++ b/examples/tensorrt_llm/utils/request_handlers/handler_base.py @@ -59,7 +59,9 @@ def check_error(self, result: dict): if self.disaggregation_mode == "prefill": return result["finish_reason"] == "error" else: - return result["finish_reason"] == "stop" or result["finish_reason"] == "error" + return ( + result["finish_reason"] == "stop" or result["finish_reason"] == "error" + ) async def generate_locally(self, request: dict): """ diff --git a/examples/tensorrt_llm/utils/request_handlers/handlers.py b/examples/tensorrt_llm/utils/request_handlers/handlers.py index 4498960e61..338471c424 100644 --- a/examples/tensorrt_llm/utils/request_handlers/handlers.py +++ b/examples/tensorrt_llm/utils/request_handlers/handlers.py @@ -96,7 +96,9 @@ async def generate(self, request: dict): if 
response_count > 1: raise ValueError("Prefill response should be generated only once.") - if self.disaggregation_strategy == "prefill_first" and not self.check_error(prefill_response): + if self.disaggregation_strategy == "prefill_first" and not self.check_error( + prefill_response + ): # If operating under prefill_first strategy, the prefill handler needs to trigger # the decode handler. request["disaggregated_params"] = prefill_response["disaggregated_params"] @@ -134,8 +136,9 @@ async def generate(self, request: dict): if self.check_error(prefill_response.data()): yield prefill_response.data() return - request["disaggregated_params"] = prefill_response.data()["disaggregated_params"] - + request["disaggregated_params"] = prefill_response.data()[ + "disaggregated_params" + ] async for res in self.generate_locally(request): yield res From 72ad9422872c7bb870319f9b600ee1c2bdb5612e Mon Sep 17 00:00:00 2001 From: tanmayv25 Date: Fri, 11 Jul 2025 15:52:43 -0700 Subject: [PATCH 03/22] Fix perm --- examples/tensorrt_llm/multinode/multinode-examples.md | 0 .../multinode/{slurm_launch => }/srun_aggregated.sh | 6 +++--- .../multinode/{slurm_launch => }/srun_disaggregated.sh | 0 .../multinode/{slurm_launch => }/start_frontend_services.sh | 0 .../multinode/{slurm_launch => }/start_trtllm_worker.sh | 0 5 files changed, 3 insertions(+), 3 deletions(-) mode change 100644 => 100755 examples/tensorrt_llm/multinode/multinode-examples.md rename examples/tensorrt_llm/multinode/{slurm_launch => }/srun_aggregated.sh (95%) mode change 100644 => 100755 rename examples/tensorrt_llm/multinode/{slurm_launch => }/srun_disaggregated.sh (100%) mode change 100644 => 100755 rename examples/tensorrt_llm/multinode/{slurm_launch => }/start_frontend_services.sh (100%) mode change 100644 => 100755 rename examples/tensorrt_llm/multinode/{slurm_launch => }/start_trtllm_worker.sh (100%) mode change 100644 => 100755 diff --git a/examples/tensorrt_llm/multinode/multinode-examples.md b/examples/tensorrt_llm/multinode/multinode-examples.md old mode 100644 new mode 100755 diff --git a/examples/tensorrt_llm/multinode/slurm_launch/srun_aggregated.sh b/examples/tensorrt_llm/multinode/srun_aggregated.sh old mode 100644 new mode 100755 similarity index 95% rename from examples/tensorrt_llm/multinode/slurm_launch/srun_aggregated.sh rename to examples/tensorrt_llm/multinode/srun_aggregated.sh index a66c330cda..748a5cbaf3 --- a/examples/tensorrt_llm/multinode/slurm_launch/srun_aggregated.sh +++ b/examples/tensorrt_llm/multinode/srun_aggregated.sh @@ -10,7 +10,7 @@ IMAGE="${IMAGE:-""}" # but you may freely customize the mounts based on your cluster. A common practice # is to mount paths to NFS storage for common scripts, model weights, etc. # NOTE: This can be a comma separated list of multiple mounts as well. -DEFAULT_MOUNT="${PWD}/../../:/mnt" +DEFAULT_MOUNT="${PWD}/../:/mnt" MOUNTS="${MOUNTS:-${DEFAULT_MOUNT}}" # Example values, assuming 4 nodes with 4 GPUs on each node, such as 4xGB200 nodes. @@ -51,7 +51,7 @@ srun \ --nodelist "${HEAD_NODE}" \ --nodes 1 \ --jobid "${SLURM_JOB_ID}" \ - /mnt/start_frontend_services.sh & + /mnt/multinode/start_frontend_services.sh & # NOTE: Output streamed to stdout for ease of understanding the example, but # in practice you would probably set `srun --output ... 
--error ...` to pipe @@ -71,4 +71,4 @@ srun \ --nodes "${NUM_NODES}" \ --ntasks-per-node "${NUM_GPUS_PER_NODE}" \ --jobid "${SLURM_JOB_ID}" \ - /mnt/start_trtllm_worker.sh & \ No newline at end of file + /mnt/multinode/start_trtllm_worker.sh & \ No newline at end of file diff --git a/examples/tensorrt_llm/multinode/slurm_launch/srun_disaggregated.sh b/examples/tensorrt_llm/multinode/srun_disaggregated.sh old mode 100644 new mode 100755 similarity index 100% rename from examples/tensorrt_llm/multinode/slurm_launch/srun_disaggregated.sh rename to examples/tensorrt_llm/multinode/srun_disaggregated.sh diff --git a/examples/tensorrt_llm/multinode/slurm_launch/start_frontend_services.sh b/examples/tensorrt_llm/multinode/start_frontend_services.sh old mode 100644 new mode 100755 similarity index 100% rename from examples/tensorrt_llm/multinode/slurm_launch/start_frontend_services.sh rename to examples/tensorrt_llm/multinode/start_frontend_services.sh diff --git a/examples/tensorrt_llm/multinode/slurm_launch/start_trtllm_worker.sh b/examples/tensorrt_llm/multinode/start_trtllm_worker.sh old mode 100644 new mode 100755 similarity index 100% rename from examples/tensorrt_llm/multinode/slurm_launch/start_trtllm_worker.sh rename to examples/tensorrt_llm/multinode/start_trtllm_worker.sh From 1e8d0a88ed15c9c7f8239e70ce38018dd96bd509 Mon Sep 17 00:00:00 2001 From: tanmayv25 Date: Fri, 11 Jul 2025 16:01:49 -0700 Subject: [PATCH 04/22] Use correct component --- examples/tensorrt_llm/multinode/multinode-examples.md | 4 ++-- examples/tensorrt_llm/multinode/start_trtllm_worker.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/tensorrt_llm/multinode/multinode-examples.md b/examples/tensorrt_llm/multinode/multinode-examples.md index bf48bf623a..508ccb1490 100755 --- a/examples/tensorrt_llm/multinode/multinode-examples.md +++ b/examples/tensorrt_llm/multinode/multinode-examples.md @@ -106,8 +106,8 @@ export IMAGE="" # For example, assuming your cluster had a `/lustre` directory on the host, you # could add that as a mount like so: # -# export MOUNTS="${PWD}:/mnt,/lustre:/lustre" -export MOUNTS="${PWD}:/mnt" +# export MOUNTS="${PWD}/../:/mnt,/lustre:/lustre" +export MOUNTS="${PWD}/../:/mnt" # NOTE: In general, Deepseek R1 is very large, so it is recommended to # pre-download the model weights and save them in some shared location, diff --git a/examples/tensorrt_llm/multinode/start_trtllm_worker.sh b/examples/tensorrt_llm/multinode/start_trtllm_worker.sh index 0806a513c0..4c77432fb9 100755 --- a/examples/tensorrt_llm/multinode/start_trtllm_worker.sh +++ b/examples/tensorrt_llm/multinode/start_trtllm_worker.sh @@ -32,8 +32,8 @@ if [[ -n ${DISAGGREGATION_STRATEGY} ]]; then fi trtllm-llmapi-launch \ - python3 /workspace/examples/tensorrt_llm/components/worker.py \ + python3 /mnt/components/worker.py \ --model-path "${MODEL_PATH}" \ - --model-name "${SERVED_MODEL_NAME}" \ + --served-model-name "${SERVED_MODEL_NAME}" \ --extra-engine-args "${ENGINE_CONFIG}" \ ${EXTRA_ARGS} \ No newline at end of file From 8fed44a33c9ec977e8ff1d769497f640e3e354da Mon Sep 17 00:00:00 2001 From: tanmayv25 Date: Fri, 11 Jul 2025 16:03:41 -0700 Subject: [PATCH 05/22] Fix the path --- .../engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml b/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml index 84e35930de..d1ceb6b5ab 100644 --- 
a/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml +++ b/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml @@ -10,7 +10,7 @@ moe_backend: WideEP # moe_max_num_tokens = max_batch_size * moe_expert_parallel_size # 4096 = 256 * 16 # moe_max_num_tokens: 4096 -moe_load_balancer: /mnt/engine_configs/eplb.yaml +moe_load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml tensor_parallel_size: 16 moe_expert_parallel_size: 16 From 15f2cab01627d136e9acb4fb77686372b1f9f42d Mon Sep 17 00:00:00 2001 From: tanmayv25 Date: Fri, 11 Jul 2025 16:18:48 -0700 Subject: [PATCH 06/22] Fix --- .../deepseek_r1/mtp/decode.yaml | 0 .../mtp/{agg.yaml => mtp_agg.yaml} | 0 .../deepseek_r1/mtp/mtp_decode.yaml | 53 +++++++++++++++++++ .../deepseek_r1/mtp/mtp_prefill.yaml | 37 +++++++++++++ .../deepseek_r1/mtp/prefill.yaml | 0 .../deepseek_r1/{ => simple}/agg.yaml | 0 .../deepseek_r1/{ => simple}/decode.yaml | 0 .../deepseek_r1/{ => simple}/prefill.yaml | 0 .../deepseek_r1/wide_ep/wide_ep_decode.yaml | 2 +- .../deepseek_r1/wide_ep/wide_ep_prefill.yaml | 41 ++++++++++++++ .../multinode/multinode-examples.md | 6 +-- 11 files changed, 135 insertions(+), 4 deletions(-) delete mode 100644 examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/decode.yaml rename examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/{agg.yaml => mtp_agg.yaml} (100%) create mode 100644 examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/mtp_decode.yaml create mode 100644 examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/mtp_prefill.yaml delete mode 100644 examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/prefill.yaml rename examples/tensorrt_llm/engine_configs/deepseek_r1/{ => simple}/agg.yaml (100%) rename examples/tensorrt_llm/engine_configs/deepseek_r1/{ => simple}/decode.yaml (100%) rename examples/tensorrt_llm/engine_configs/deepseek_r1/{ => simple}/prefill.yaml (100%) diff --git a/examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/decode.yaml b/examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/decode.yaml deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/agg.yaml b/examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/mtp_agg.yaml similarity index 100% rename from examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/agg.yaml rename to examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/mtp_agg.yaml diff --git a/examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/mtp_decode.yaml b/examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/mtp_decode.yaml new file mode 100644 index 0000000000..bfd4cde207 --- /dev/null +++ b/examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/mtp_decode.yaml @@ -0,0 +1,53 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: FP4 only supported starting with Blackwell GPUs. 
+# https://huggingface.co/nvidia/DeepSeek-R1-FP4 +# You can also specify the full path to locally downloaded weights +# instead of a HuggingFace ID here. + +backend: pytorch +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +enable_attention_dp: false +max_batch_size: 256 +# Note: When MPT is enabled and `cuda_graph_batch_sizes` is specified, `max_num_tokens` must satisfy the following formula: +# max_num_tokens >= max(cuda_graph_batch_sizes) * (num_nextn_predict_layers + 1) +# This is a known issue in TensorRT-LLM and will be resolved in the next release. +max_num_tokens: 512 +# 8704 = 8192 ISL + 512 OSL +max_seq_len: 8704 +kv_cache_config: + free_gpu_memory_fraction: 0.85 + +# Enable the MTP(Multi-Token Prediction) in decode model engine +speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +use_cuda_graph: true +cuda_graph_padding_enabled: true +cuda_graph_batch_sizes: +- 1 +- 2 +- 4 +- 8 +- 16 +- 32 +- 64 +- 128 +- 256 +print_iter_log: true +kv_cache_dtype: fp8 \ No newline at end of file diff --git a/examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/mtp_prefill.yaml b/examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/mtp_prefill.yaml new file mode 100644 index 0000000000..870a3f48d1 --- /dev/null +++ b/examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/mtp_prefill.yaml @@ -0,0 +1,37 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: FP4 only supported starting with Blackwell GPUs. +# https://huggingface.co/nvidia/DeepSeek-R1-FP4 +# You can also specify the full path to locally downloaded weights +# instead of a HuggingFace ID here. 
+ +backend: pytorch +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +enable_attention_dp: true +max_batch_size: 1 +max_num_tokens: 8192 +max_seq_len: 8192 +kv_cache_config: + free_gpu_memory_fraction: 0.75 +print_iter_log: true +kv_cache_dtype: fp8 +disable_overlap_scheduler: true + +# Enable the MTP(Multi-Token Prediction) in the prefill model engine +speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 \ No newline at end of file diff --git a/examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/prefill.yaml b/examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/prefill.yaml deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/examples/tensorrt_llm/engine_configs/deepseek_r1/agg.yaml b/examples/tensorrt_llm/engine_configs/deepseek_r1/simple/agg.yaml similarity index 100% rename from examples/tensorrt_llm/engine_configs/deepseek_r1/agg.yaml rename to examples/tensorrt_llm/engine_configs/deepseek_r1/simple/agg.yaml diff --git a/examples/tensorrt_llm/engine_configs/deepseek_r1/decode.yaml b/examples/tensorrt_llm/engine_configs/deepseek_r1/simple/decode.yaml similarity index 100% rename from examples/tensorrt_llm/engine_configs/deepseek_r1/decode.yaml rename to examples/tensorrt_llm/engine_configs/deepseek_r1/simple/decode.yaml diff --git a/examples/tensorrt_llm/engine_configs/deepseek_r1/prefill.yaml b/examples/tensorrt_llm/engine_configs/deepseek_r1/simple/prefill.yaml similarity index 100% rename from examples/tensorrt_llm/engine_configs/deepseek_r1/prefill.yaml rename to examples/tensorrt_llm/engine_configs/deepseek_r1/simple/prefill.yaml diff --git a/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml b/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml index dc43788f94..ad3d5df0bf 100644 --- a/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml +++ b/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml @@ -16,7 +16,7 @@ backend: pytorch # WideEP related settings moe_backend: WideEP -moe_load_balancer: /mnt/engine_configs/eplb.yaml +moe_load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml # TP/EP/PP/DP tensor_parallel_size: 16 diff --git a/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml b/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml index e69de29bb2..d750d3a2aa 100644 --- a/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml +++ b/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml @@ -0,0 +1,41 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+backend: pytorch + +# WideEP related settings +moe_backend: WideEP +moe_load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml + +# TP/EP/PP/DP +tensor_parallel_size: 16 +moe_expert_parallel_size: 16 +pipeline_parallel_size: 1 +enable_attention_dp: true + +max_batch_size: 1 +max_num_tokens: 8192 +max_seq_len: 8192 + +kv_cache_config: + free_gpu_memory_fraction: 0.75 + +# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603 +# NOTE: overlap_scheduler enabled by default since this commit and changed +# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler': +# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 +disable_overlap_scheduler: true +print_iter_log: true +# NOTE: This dtype must match in both prefill/decode configs +kv_cache_dtype: fp8 \ No newline at end of file diff --git a/examples/tensorrt_llm/multinode/multinode-examples.md b/examples/tensorrt_llm/multinode/multinode-examples.md index 508ccb1490..60162aafad 100755 --- a/examples/tensorrt_llm/multinode/multinode-examples.md +++ b/examples/tensorrt_llm/multinode/multinode-examples.md @@ -136,7 +136,7 @@ follow these steps below to launch an **aggregated** deployment across 4 nodes: ```bash # Default set in srun_aggregated.sh, but can customize here. -# export ENGINE_CONFIG="/mnt/engine_configs/wide_ep_agg.yaml" +# export ENGINE_CONFIG="/mnt/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml" # Customize NUM_NODES to match the desired parallelism in ENGINE_CONFIG # The product of NUM_NODES*NUM_GPUS_PER_NODE should match the number of @@ -165,8 +165,8 @@ deployment across 8 nodes: ```bash # Defaults set in srun_disaggregated.sh, but can customize here. -# export PREFILL_ENGINE_CONFIG="/mnt/engine_configs/wide_ep_prefill.yaml" -# export DECODE_ENGINE_CONFIG="/mnt/engine_configs/wide_ep_decode.yaml" +# export PREFILL_ENGINE_CONFIG="/mnt/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml" +# export DECODE_ENGINE_CONFIG="/mnt/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml" # Customize NUM_PREFILL_NODES to match the desired parallelism in PREFILL_ENGINE_CONFIG # Customize NUM_DECODE_NODES to match the desired parallelism in DECODE_ENGINE_CONFIG From e04815dc221e3bdc715b32ef4aee4faf53442275 Mon Sep 17 00:00:00 2001 From: tanmayv25 Date: Fri, 11 Jul 2025 16:24:22 -0700 Subject: [PATCH 07/22] Fix disagg path --- examples/tensorrt_llm/multinode/srun_disaggregated.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/tensorrt_llm/multinode/srun_disaggregated.sh b/examples/tensorrt_llm/multinode/srun_disaggregated.sh index 155cb3254e..d1928d76c8 100755 --- a/examples/tensorrt_llm/multinode/srun_disaggregated.sh +++ b/examples/tensorrt_llm/multinode/srun_disaggregated.sh @@ -10,7 +10,7 @@ IMAGE="${IMAGE:-""}" # but you may freely customize the mounts based on your cluster. A common practice # is to mount paths to NFS storage for common scripts, model weights, etc. # NOTE: This can be a comma separated list of multiple mounts as well. -DEFAULT_MOUNT="${PWD}:/mnt" +DEFAULT_MOUNT="${PWD}/../:/mnt" MOUNTS="${MOUNTS:-${DEFAULT_MOUNT}}" NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE:-4} @@ -75,7 +75,7 @@ srun \ --nodes "${NUM_PREFILL_NODES}" \ --ntasks-per-node "${NUM_GPUS_PER_NODE}" \ --jobid "${SLURM_JOB_ID}" \ - /mnt/start_trtllm_worker.sh & + /mnt/multinode/start_trtllm_worker.sh & echo "Launching multi-node decode worker in background." 
DISAGGREGATION_MODE=decode \ @@ -93,4 +93,4 @@ srun \ --nodes "${NUM_DECODE_NODES}" \ --ntasks-per-node "${NUM_GPUS_PER_NODE}" \ --jobid "${SLURM_JOB_ID}" \ - /mnt/start_trtllm_worker.sh & \ No newline at end of file + /mnt/multinode/start_trtllm_worker.sh & \ No newline at end of file From 0192e608ebef193cb5d79d7eb4c8ab8aded51f61 Mon Sep 17 00:00:00 2001 From: tanmayv25 Date: Fri, 11 Jul 2025 16:29:00 -0700 Subject: [PATCH 08/22] Fix it --- examples/tensorrt_llm/multinode/srun_disaggregated.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tensorrt_llm/multinode/srun_disaggregated.sh b/examples/tensorrt_llm/multinode/srun_disaggregated.sh index d1928d76c8..316771a3bc 100755 --- a/examples/tensorrt_llm/multinode/srun_disaggregated.sh +++ b/examples/tensorrt_llm/multinode/srun_disaggregated.sh @@ -54,7 +54,7 @@ srun \ --nodelist "${HEAD_NODE}" \ --nodes 1 \ --jobid "${SLURM_JOB_ID}" \ - /mnt/start_frontend_services.sh & + /mnt/multinode/start_frontend_services.sh & # NOTE: Output streamed to stdout for ease of understanding the example, but # in practice you would probably set `srun --output ... --error ...` to pipe From 0c2c8cc69162a89248251d55922fd01c00f0532b Mon Sep 17 00:00:00 2001 From: tanmayv25 Date: Fri, 11 Jul 2025 16:30:40 -0700 Subject: [PATCH 09/22] Fix paths --- examples/tensorrt_llm/multinode/srun_disaggregated.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/tensorrt_llm/multinode/srun_disaggregated.sh b/examples/tensorrt_llm/multinode/srun_disaggregated.sh index 316771a3bc..db422ed5f9 100755 --- a/examples/tensorrt_llm/multinode/srun_disaggregated.sh +++ b/examples/tensorrt_llm/multinode/srun_disaggregated.sh @@ -16,10 +16,10 @@ MOUNTS="${MOUNTS:-${DEFAULT_MOUNT}}" NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE:-4} NUM_PREFILL_NODES=${NUM_PREFILL_NODES:-4} -PREFILL_ENGINE_CONFIG="${PREFILL_ENGINE_CONFIG:-/mnt/engine_configs/wide_ep_prefill.yaml}" +PREFILL_ENGINE_CONFIG="${PREFILL_ENGINE_CONFIG:-/mnt/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml}" NUM_DECODE_NODES=${NUM_DECODE_NODES:-4} -DECODE_ENGINE_CONFIG="${DECODE_ENGINE_CONFIG:-/mnt/engine_configs/wide_ep_decode.yaml}" +DECODE_ENGINE_CONFIG="${DECODE_ENGINE_CONFIG:-/mnt/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml}" DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"} From 13514f9fcea613ec7a7b4c27d3ff01afa19d8f7e Mon Sep 17 00:00:00 2001 From: tanmayv25 Date: Fri, 11 Jul 2025 16:38:32 -0700 Subject: [PATCH 10/22] Fix disagg --- examples/tensorrt_llm/multinode/start_trtllm_worker.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/tensorrt_llm/multinode/start_trtllm_worker.sh b/examples/tensorrt_llm/multinode/start_trtllm_worker.sh index 4c77432fb9..ac7cd1b120 100755 --- a/examples/tensorrt_llm/multinode/start_trtllm_worker.sh +++ b/examples/tensorrt_llm/multinode/start_trtllm_worker.sh @@ -22,6 +22,13 @@ if [[ -z ${ENGINE_CONFIG} ]]; then exit 1 fi +# NOTE: When this script is run directly from srun, the environment variables +# for TRTLLM KV cache are not set. So we need to set them here. 
+# Related issue: https://github.com/ai-dynamo/dynamo/issues/1743 +if [[ -z ${TRTLLM_USE_UCX_KVCACHE} ]] && [[ -z ${TRTLLM_USE_NIXL_KVCACHE} ]]; then + export TRTLLM_USE_UCX_KVCACHE=1 +fi + EXTRA_ARGS="" if [[ -n ${DISAGGREGATION_MODE} ]]; then EXTRA_ARGS+="--disaggregation-mode ${DISAGGREGATION_MODE}" From 1a73ef5c24dbffe7a327a55c7aac8d993f147b71 Mon Sep 17 00:00:00 2001 From: tanmayv25 Date: Fri, 11 Jul 2025 17:53:41 -0700 Subject: [PATCH 11/22] Add more documentation --- examples/tensorrt_llm/README.md | 53 ++++++++++++++----- .../tensorrt_llm/engine_configs/decode.yaml | 2 +- .../engine_configs/deepseek_r1/README.md | 19 ------- .../tensorrt_llm/engine_configs/prefill.yaml | 2 +- examples/tensorrt_llm/utils/get_env_vars.sh | 46 ---------------- 5 files changed, 42 insertions(+), 80 deletions(-) delete mode 100644 examples/tensorrt_llm/engine_configs/deepseek_r1/README.md delete mode 100644 examples/tensorrt_llm/utils/get_env_vars.sh diff --git a/examples/tensorrt_llm/README.md b/examples/tensorrt_llm/README.md index 80b3c02035..2c2b84d201 100644 --- a/examples/tensorrt_llm/README.md +++ b/examples/tensorrt_llm/README.md @@ -27,11 +27,11 @@ This directory contains examples and reference implementations for deploying Lar - [Build docker](#build-docker) - [Run container](#run-container) - [Run deployment](#run-deployment) - - [Single Node deployment](#single-node-example-architectures) + - [Single Node deployment](#single-node-deployments) - [Multinode deployment](#multinode-deployment) - [Client](#client) - [Benchmarking](#benchmarking) - - [Close Deployment](#close-deployment) +- [Disaggregation Strategy](#disaggregation-strategy) - [KV Cache Transfer](#kv-cache-transfer-in-disaggregated-serving) # Quick Start @@ -50,7 +50,7 @@ git checkout $(git describe --tags $(git rev-list --tags --max-count=1)) ## Deployment Architectures -See [deployment architectures](../llm/README.md#deployment-architectures) to learn about the general idea of the architecture. +See [deployment architectures](../llm/README.md#deployment-architectures) to learn about the general idea of the architecture. Note: TensorRT-LLM disaggregation does not support conditional disaggregation yet. You can configure the deployment to always use either aggregate or disaggregated serving. @@ -99,8 +99,8 @@ This figure shows an overview of the major components to deploy: ``` +------+ +-----------+ +------------------+ +---------------+ -| HTTP |----->| processor |----->| Worker |------------>| Prefill | -| |<-----| |<-----| |<------------| Worker | +| HTTP |----->| processor |----->| Worker1 |------------>| Worker2 | +| |<-----| |<-----| |<------------| | +------+ +-----------+ +------------------+ +---------------+ | ^ | query best | | return | publish kv events @@ -112,10 +112,9 @@ This figure shows an overview of the major components to deploy: ``` -Note: The above architecture illustrates all the components. The final components -that get spawned depend upon the chosen graph. +**Note:** The diagram above shows all possible components in a deployment. Depending on the chosen disaggregation strategy, you can configure whether Worker1 handles prefill and Worker2 handles decode, or vice versa. For more information on how to select and configure these strategies, see the [Disaggregation Strategy](#disaggregation-strategy) section below. -### Single-Node example architectures +### Single-Node Deployments > [!IMPORTANT] > Below we provide some simple shell scripts that run the components for each configuration. 
Each shell script is simply running the `dynamo-run` to start up the ingress and using `python3` to start up the workers. You can easily take each commmand and run them in separate terminals. @@ -152,10 +151,28 @@ cd $DYNAMO_ROOT/examples/tensorrt_llm ./launch/disagg_router.sh ``` -### Multinode Deployment +#### Aggregated with Multi-Token Prediction (MTP) and DeepSeek R1 +```bash +cd $DYNAMO_ROOT/examples/tensorrt_llm + +export AGG_ENGINE_ARGS=./engine_configs/deepseek_r1/mtp/mtp_agg.yaml +export SERVED_MODEL_NAME="nvidia/DeepSeek-R1-FP4" +# nvidia/DeepSeek-R1-FP4 is a large model +export MODEL_PATH="nvidia/DeepSeek-R1-FP4" +./launch/agg.sh +``` + +Notes: +- MTP is only available within the container built with the experimental TensorRT-LLM commit. Please add --use-default-experimental-tensorrtllm-commit to the arguments of the build.sh script. + + Example: `./container/build.sh --framework tensorrtllm --use-default-experimental-tensorrtllm-commit` -For details and instructions on multinode serving, please refer to the [multinode-examples.md](./multinode-examples.md) document. This guide provides step-by-step examples and configuration tips for deploying Dynamo with TensorRT-LLM across multiple nodes. +- There is a noticeable latency for the first two inference requests. Please send warm-up requests before starting the benchmark. +- MTP performance may vary depending on the acceptance rate of predicted tokens, which is dependent on the dataset or queries used while benchmarking. Additionally, `ignore_eos` should generally be omitted or set to `false` when using MTP to avoid speculating garbage outputs and getting unrealistic acceptance rates. +### Multinode Deployment + +For details and instructions on multinode serving, please refer to the [multinode-examples.md](./multinode/multinode-examples.md) document. This guide provides step-by-step examples and configuration tips for deploying Dynamo with TensorRT-LLM across multiple nodes. ### Client @@ -168,10 +185,20 @@ NOTE: To send a request to a multi-node deployment, target the node which is run To benchmark your deployment with GenAI-Perf, see this utility script, configuring the `model` name and `host` based on your deployment: [perf.sh](../../benchmarks/llm/perf.sh) -### Close deployment -See [close deployment](../../docs/guides/dynamo_serve.md#close-deployment) section to learn about how to close the deployment. +## Disaggregation Strategy + +The disaggregation strategy controls how requests are distributed between the prefill and decode workers in a disaggregated deployment. + +By default, Dynamo uses a `decode first` strategy: incoming requests are initially routed to the decode worker, which then forwards them to the prefill worker in round-robin fashion. The prefill worker processes the request and returns results to the decode worker for any remaining decode operations. + +When using KV routing, however, Dynamo switches to a `prefill first` strategy. In this mode, requests are routed directly to the prefill worker, which can help maximize KV cache reuse and improve overall efficiency for certain workloads. Choosing the appropriate strategy can have a significant impact on performance, depending on your use case. + +The disaggregation strategy can be set using the `DISAGGREGATION_STRATEGY` environment variable. 
You can set the strategy before launching your deployment, for example: +```bash +DISAGGREGATION_STRATEGY="prefill_first" ./launch/disagg.sh +``` -### KV Cache Transfer in Disaggregated Serving +## KV Cache Transfer in Disaggregated Serving Dynamo with TensorRT-LLM supports two methods for transferring KV cache in disaggregated serving: UCX (default) and NIXL (experimental). For detailed information and configuration instructions for each method, see the [KV cache transfer guide](./kv-cache-tranfer.md). diff --git a/examples/tensorrt_llm/engine_configs/decode.yaml b/examples/tensorrt_llm/engine_configs/decode.yaml index 6492c9b9e0..eb943fd6e7 100644 --- a/examples/tensorrt_llm/engine_configs/decode.yaml +++ b/examples/tensorrt_llm/engine_configs/decode.yaml @@ -23,5 +23,5 @@ enable_chunked_prefill: true disable_overlap_scheduler: false use_cuda_graph: true kv_cache_config: - free_gpu_memory_fraction: 0.40 + free_gpu_memory_fraction: 0.95 diff --git a/examples/tensorrt_llm/engine_configs/deepseek_r1/README.md b/examples/tensorrt_llm/engine_configs/deepseek_r1/README.md deleted file mode 100644 index 5c7a87782c..0000000000 --- a/examples/tensorrt_llm/engine_configs/deepseek_r1/README.md +++ /dev/null @@ -1,19 +0,0 @@ - - - -This folder contains launch scripts for \ No newline at end of file diff --git a/examples/tensorrt_llm/engine_configs/prefill.yaml b/examples/tensorrt_llm/engine_configs/prefill.yaml index 0e7ad45fed..5dee9e653d 100644 --- a/examples/tensorrt_llm/engine_configs/prefill.yaml +++ b/examples/tensorrt_llm/engine_configs/prefill.yaml @@ -25,4 +25,4 @@ disable_overlap_scheduler: true use_cuda_graph: false kv_cache_config: - free_gpu_memory_fraction: 0.40 + free_gpu_memory_fraction: 0.95 diff --git a/examples/tensorrt_llm/utils/get_env_vars.sh b/examples/tensorrt_llm/utils/get_env_vars.sh deleted file mode 100644 index bd67e180a9..0000000000 --- a/examples/tensorrt_llm/utils/get_env_vars.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -# Helper script to generate environment variables for each node during a multinode SGLang deployment - -echo "=== USAGE ===" -echo "1. Run this script: ./gen_env_vars.sh" -echo "2. Enter the IP addresses when prompted" -echo "3. Copy the commands for the head prefill node and run them" -echo "4. Copy the commands for all other nodes and run them on each node" -echo "5. 
Proceed with starting your prefill and decode workers" -echo "" - -# Prompt for IP addresses -read -p "Enter HEAD_PREFILL_NODE IP: " HEAD_PREFILL_NODE -read -p "Enter HEAD_DECODE_NODE IP: " HEAD_DECODE_NODE - -# Validate inputs -if [ -z "$HEAD_PREFILL_NODE" ] || [ -z "$HEAD_DECODE_NODE" ]; then - echo "Error: Both IP addresses are required" - exit 1 -fi - -echo "=== HEAD PREFILL NODE ($HEAD_PREFILL_NODE) ===" -echo "Run all of these commands on the head prefill node:" -echo "" -echo "nats-server -js &" -echo "etcd --listen-client-urls http://0.0.0.0:2379 \\" -echo " --advertise-client-urls http://0.0.0.0:2379 \\" -echo " --listen-peer-urls http://0.0.0.0:2380 \\" -echo " --initial-cluster default=http://$HEAD_PREFILL_NODE:2380 &" -echo "export HEAD_PREFILL_NODE_IP=$HEAD_PREFILL_NODE" -echo "export HEAD_DECODE_NODE_IP=$HEAD_DECODE_NODE" -echo "" -echo "=== ALL OTHER NODES ===" -echo "Run these commands on all other nodes (prefill and decode):" -echo "" -echo "# Export environment variables" -echo "export NATS_SERVER=nats://$HEAD_PREFILL_NODE:4222" -echo "export ETCD_ENDPOINTS=http://$HEAD_PREFILL_NODE:2379" -echo "export HEAD_PREFILL_NODE_IP=$HEAD_PREFILL_NODE" -echo "export HEAD_DECODE_NODE_IP=$HEAD_DECODE_NODE" - - - From c70fc00374a46defc26d0d0b28b96c8a31fc398a Mon Sep 17 00:00:00 2001 From: tanmayv25 Date: Mon, 14 Jul 2025 10:42:10 -0700 Subject: [PATCH 12/22] executable fix --- examples/tensorrt_llm/multinode/multinode-examples.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 examples/tensorrt_llm/multinode/multinode-examples.md diff --git a/examples/tensorrt_llm/multinode/multinode-examples.md b/examples/tensorrt_llm/multinode/multinode-examples.md old mode 100755 new mode 100644 From 53bfc34bc606ed0de76c787b3f87fb32a391edb9 Mon Sep 17 00:00:00 2001 From: tanmayv25 Date: Mon, 14 Jul 2025 14:38:07 -0700 Subject: [PATCH 13/22] More refactor and fixes --- .../tensorrt_llm/utils/clear_namespace.py | 1 - .../utils/request_handlers/handler_base.py | 42 +++++++++---- .../utils/request_handlers/handlers.py | 60 ++++++++++++------- examples/tensorrt_llm/utils/trtllm_utils.py | 34 +++++++---- 4 files changed, 92 insertions(+), 45 deletions(-) diff --git a/examples/tensorrt_llm/utils/clear_namespace.py b/examples/tensorrt_llm/utils/clear_namespace.py index e7fcc46485..8bca3a5e7a 100644 --- a/examples/tensorrt_llm/utils/clear_namespace.py +++ b/examples/tensorrt_llm/utils/clear_namespace.py @@ -1,4 +1,3 @@ -# SPDX-FileCopyrightText: Copyright (c) 2020 Atalaya Tech. Inc # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 # # diff --git a/examples/tensorrt_llm/utils/request_handlers/handler_base.py b/examples/tensorrt_llm/utils/request_handlers/handler_base.py index 8f7fc35676..9dca9823cf 100644 --- a/examples/tensorrt_llm/utils/request_handlers/handler_base.py +++ b/examples/tensorrt_llm/utils/request_handlers/handler_base.py @@ -15,15 +15,30 @@ import logging from dataclasses import asdict, dataclass +from enum import Enum +from tensorrt_llm import SamplingParams from tensorrt_llm.llmapi import DisaggregatedParams as LlmDisaggregatedParams from utils.disagg_utils import DisaggregatedParams, DisaggregatedParamsCodec +from dynamo.llm.tensorrtllm.engine import TensorRTLLMEngine +from dynamo.llm.tensorrtllm.publisher import Publisher from dynamo.runtime.logging import configure_dynamo_logging configure_dynamo_logging() +class DisaggregationMode(Enum): + AGGREGATED = "prefill_and_decode" + PREFILL = "prefill" + DECODE = "decode" + + +class DisaggregationStrategy(Enum): + PREFILL_FIRST = "prefill_first" + DECODE_FIRST = "decode_first" + + @dataclass class RequestHandlerConfig: """ @@ -31,10 +46,12 @@ class RequestHandlerConfig: """ component: object - engine: object - default_sampling_params: object - publisher: object - disaggregation_mode: str + engine: TensorRTLLMEngine + default_sampling_params: SamplingParams + publisher: Publisher + disaggregation_mode: DisaggregationMode + disaggregation_strategy: DisaggregationStrategy + next_client: object class HandlerBase: @@ -56,7 +73,7 @@ def check_error(self, result: dict): """ Check if there is an error in the result. """ - if self.disaggregation_mode == "prefill": + if self.disaggregation_mode == DisaggregationMode.PREFILL: return result["finish_reason"] == "error" else: return ( @@ -81,7 +98,7 @@ async def generate_locally(self, request: dict): # Decode the disaggregated params from the request disaggregated_params = None - if self.disaggregation_mode == "prefill": + if self.disaggregation_mode == DisaggregationMode.PREFILL: request["stop_conditions"]["max_tokens"] = 1 disaggregated_params = LlmDisaggregatedParams(request_type="context_only") @@ -91,7 +108,10 @@ async def generate_locally(self, request: dict): ) disaggregated_params.request_type = "generation_only" - if self.disaggregation_mode == "decode" and disaggregated_params is None: + if ( + self.disaggregation_mode == DisaggregationMode.DECODE + and disaggregated_params is None + ): raise ValueError("Disaggregated params are required for decode mode") num_output_tokens_so_far = 0 @@ -109,7 +129,9 @@ async def generate_locally(self, request: dict): # TODO: Instead of True, we should use streaming from the request. # However, currently dynamo run does not send streaming in the request. 
- streaming = False if self.disaggregation_mode == "prefill" else True + streaming = ( + False if self.disaggregation_mode == DisaggregationMode.PREFILL else True + ) async for res in self.engine.llm.generate_async( inputs=inputs, @@ -123,7 +145,7 @@ async def generate_locally(self, request: dict): self.publisher.start() self.first_generation = False - if res.finished and self.disaggregation_mode != "prefill": + if res.finished and self.disaggregation_mode != DisaggregationMode.PREFILL: yield {"finish_reason": "stop", "token_ids": []} break @@ -138,7 +160,7 @@ async def generate_locally(self, request: dict): out["finish_reason"] = output.finish_reason if output.stop_reason: out["stop_reason"] = output.stop_reason - if self.disaggregation_mode == "prefill": + if self.disaggregation_mode == DisaggregationMode.PREFILL: # Return the disaggregated params only when operating in prefill mode. out["disaggregated_params"] = asdict( DisaggregatedParamsCodec.encode(output.disaggregated_params) diff --git a/examples/tensorrt_llm/utils/request_handlers/handlers.py b/examples/tensorrt_llm/utils/request_handlers/handlers.py index 338471c424..fe4e60655a 100644 --- a/examples/tensorrt_llm/utils/request_handlers/handlers.py +++ b/examples/tensorrt_llm/utils/request_handlers/handlers.py @@ -1,7 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + import copy from dataclasses import dataclass -from utils.request_handlers.handler_base import HandlerBase +from utils.request_handlers.handler_base import ( + DisaggregationMode, + DisaggregationStrategy, + HandlerBase, +) @dataclass @@ -14,8 +21,8 @@ class RequestHandlerConfig: engine: object default_sampling_params: object publisher: object - disaggregation_mode: str - disaggregation_strategy: str + disaggregation_mode: DisaggregationMode + disaggregation_strategy: DisaggregationStrategy next_client: object @@ -28,23 +35,27 @@ def __init__(self): } def _validate_config(self, config: RequestHandlerConfig): - if config.disaggregation_mode not in self.handlers: - raise ValueError( - f"Invalid disaggregation_mode '{config.disaggregation_mode}'. " - f"Supported modes: {list(self.handlers.keys())}" - ) + mode_value = ( + config.disaggregation_mode.value + if hasattr(config.disaggregation_mode, "value") + else str(config.disaggregation_mode) + ) + if mode_value not in self.handlers: + raise ValueError(f"Invalid disaggregation_mode '{mode_value}'") if not config.next_client: if ( - config.disaggregation_mode == "prefill" - and config.disaggregation_strategy == "prefill_first" + config.disaggregation_mode == DisaggregationMode.PREFILL + and config.disaggregation_strategy + == DisaggregationStrategy.PREFILL_FIRST ): raise ValueError( "Next client is required for the main worker when disaggregation_mode='prefill' and disaggregation_strategy='prefill_first'." ) if ( - config.disaggregation_mode == "decode" - and config.disaggregation_strategy == "decode_first" + config.disaggregation_mode == DisaggregationMode.DECODE + and config.disaggregation_strategy + == DisaggregationStrategy.DECODE_FIRST ): raise ValueError( "Next client is required for the decode worker when disaggregation_mode='decode' and disaggregation_strategy='decode_first'." 
@@ -52,7 +63,7 @@ def _validate_config(self, config: RequestHandlerConfig): def get_request_handler(self, config: RequestHandlerConfig) -> HandlerBase: self._validate_config(config) - return self.handlers[config.disaggregation_mode](config) + return self.handlers[config.disaggregation_mode.value](config) def get_request_handler(config: RequestHandlerConfig) -> HandlerBase: @@ -96,12 +107,16 @@ async def generate(self, request: dict): if response_count > 1: raise ValueError("Prefill response should be generated only once.") - if self.disaggregation_strategy == "prefill_first" and not self.check_error( - prefill_response + if ( + self.disaggregation_strategy == DisaggregationStrategy.PREFILL_FIRST + and not self.check_error(prefill_response) ): # If operating under prefill_first strategy, the prefill handler needs to trigger # the decode handler. - request["disaggregated_params"] = prefill_response["disaggregated_params"] + if prefill_response is not None: + request["disaggregated_params"] = prefill_response[ + "disaggregated_params" + ] async for res in self.remote_decode(request): yield res else: @@ -122,7 +137,7 @@ async def remote_prefill(self, request: dict): yield res async def generate(self, request: dict): - if self.disaggregation_strategy == "decode_first": + if self.disaggregation_strategy == DisaggregationStrategy.DECODE_FIRST: prefill_response = None # If operating under decode_first strategy, the decode handler needs to trigger # the prefill handler. @@ -133,12 +148,15 @@ async def generate(self, request: dict): if response_count > 1: raise ValueError("Prefill response should be generated only once.") - if self.check_error(prefill_response.data()): + if prefill_response is not None and self.check_error( + prefill_response.data() + ): yield prefill_response.data() return - request["disaggregated_params"] = prefill_response.data()[ - "disaggregated_params" - ] + if prefill_response is not None: + request["disaggregated_params"] = prefill_response.data()[ + "disaggregated_params" + ] async for res in self.generate_locally(request): yield res diff --git a/examples/tensorrt_llm/utils/trtllm_utils.py b/examples/tensorrt_llm/utils/trtllm_utils.py index 740210106f..9dbd4f0118 100644 --- a/examples/tensorrt_llm/utils/trtllm_utils.py +++ b/examples/tensorrt_llm/utils/trtllm_utils.py @@ -1,12 +1,20 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + import argparse from typing import Optional +from utils.request_handlers.handler_base import ( + DisaggregationMode, + DisaggregationStrategy, +) + # Default endpoint for the next worker. DEFAULT_ENDPOINT = "dyn://dynamo.tensorrt_llm.generate" DEFAULT_MODEL_PATH = "TinyLlama-1.1B-Instruct" DEFAULT_NEXT_ENDPOINT = "dyn://dynamo.tensorrt_llm_next.generate" -DEFAULT_DISAGGREGATION_STRATEGY = "decode_first" -DEFAULT_DISAGGREGATION_MODE = "prefill_and_decode" +DEFAULT_DISAGGREGATION_STRATEGY = DisaggregationStrategy.DECODE_FIRST +DEFAULT_DISAGGREGATION_MODE = DisaggregationMode.AGGREGATED class Config: @@ -21,8 +29,8 @@ class Config: kv_block_size: int extra_engine_args: str publish_events_and_metrics: bool - disaggregation_mode: str - disaggregation_strategy: str + disaggregation_mode: DisaggregationMode + disaggregation_strategy: DisaggregationStrategy next_endpoint: str def __str__(self) -> str: @@ -46,16 +54,16 @@ def is_first_worker(config): """ Check if the current worker is the first worker in the disaggregation chain. 
""" - is_primary_worker = config.disaggregation_mode == "prefill_and_decode" + is_primary_worker = config.disaggregation_mode == DisaggregationMode.AGGREGATED if not is_primary_worker: - is_primary_worker = (config.disaggregation_strategy == "prefill_first") and ( - config.disaggregation_mode == "prefill" - ) + is_primary_worker = ( + config.disaggregation_strategy == DisaggregationStrategy.PREFILL_FIRST + ) and (config.disaggregation_mode == DisaggregationMode.PREFILL) if not is_primary_worker: - is_primary_worker = (config.disaggregation_strategy == "decode_first") and ( - config.disaggregation_mode == "decode" - ) + is_primary_worker = ( + config.disaggregation_strategy == DisaggregationStrategy.DECODE_FIRST + ) and (config.disaggregation_mode == DisaggregationMode.DECODE) return is_primary_worker @@ -68,8 +76,8 @@ def parse_endpoint(endpoint: str) -> tuple[str, str, str]: f"Invalid endpoint format: '{endpoint}'. " "Expected 'dyn://namespace.component.endpoint' or 'namespace.component.endpoint'." ) - - return tuple(endpoint_parts) + namespace, component, endpoint_name = endpoint_parts + return namespace, component, endpoint_name def cmd_line_args(): From f66eb729a6d46c435254a4c0aee5b83598f24289 Mon Sep 17 00:00:00 2001 From: tanmayv25 Date: Mon, 14 Jul 2025 15:28:23 -0700 Subject: [PATCH 14/22] Fix the arg parsing --- examples/tensorrt_llm/launch/disagg.sh | 4 +-- examples/tensorrt_llm/launch/disagg_router.sh | 4 +-- .../utils/request_handlers/handlers.py | 11 +++--- examples/tensorrt_llm/utils/trtllm_utils.py | 34 +++++++++++-------- 4 files changed, 28 insertions(+), 25 deletions(-) diff --git a/examples/tensorrt_llm/launch/disagg.sh b/examples/tensorrt_llm/launch/disagg.sh index d4952838cb..74a1df3b70 100755 --- a/examples/tensorrt_llm/launch/disagg.sh +++ b/examples/tensorrt_llm/launch/disagg.sh @@ -6,8 +6,8 @@ export MODEL_PATH=${MODEL_PATH:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"} export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"} -export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"configs/prefill.yaml"} -export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"configs/decode.yaml"} +export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"engine_configs/prefill.yaml"} +export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"engine_configs/decode.yaml"} export PREFILL_CUDA_VISIBLE_DEVICES=${PREFILL_CUDA_VISIBLE_DEVICES:-"0"} export DECODE_CUDA_VISIBLE_DEVICES=${DECODE_CUDA_VISIBLE_DEVICES:-"1"} diff --git a/examples/tensorrt_llm/launch/disagg_router.sh b/examples/tensorrt_llm/launch/disagg_router.sh index a373a8f6bd..4fb20dff07 100755 --- a/examples/tensorrt_llm/launch/disagg_router.sh +++ b/examples/tensorrt_llm/launch/disagg_router.sh @@ -6,8 +6,8 @@ export MODEL_PATH=${MODEL_PATH:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"} export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"prefill_first"} -export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"configs/prefill.yaml"} -export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"configs/decode.yaml"} +export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"engine_configs/prefill.yaml"} +export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"engine_configs/decode.yaml"} export PREFILL_CUDA_VISIBLE_DEVICES=${PREFILL_CUDA_VISIBLE_DEVICES:-"0"} export DECODE_CUDA_VISIBLE_DEVICES=${DECODE_CUDA_VISIBLE_DEVICES:-"1"} diff --git 
a/examples/tensorrt_llm/utils/request_handlers/handlers.py b/examples/tensorrt_llm/utils/request_handlers/handlers.py index fe4e60655a..96ff37f781 100644 --- a/examples/tensorrt_llm/utils/request_handlers/handlers.py +++ b/examples/tensorrt_llm/utils/request_handlers/handlers.py @@ -35,13 +35,10 @@ def __init__(self): } def _validate_config(self, config: RequestHandlerConfig): - mode_value = ( - config.disaggregation_mode.value - if hasattr(config.disaggregation_mode, "value") - else str(config.disaggregation_mode) - ) - if mode_value not in self.handlers: - raise ValueError(f"Invalid disaggregation_mode '{mode_value}'") + if config.disaggregation_mode.value not in self.handlers: + raise ValueError( + f"Invalid disaggregation_mode '{config.disaggregation_mode.value}'" + ) if not config.next_client: if ( diff --git a/examples/tensorrt_llm/utils/trtllm_utils.py b/examples/tensorrt_llm/utils/trtllm_utils.py index 9dbd4f0118..b429a20539 100644 --- a/examples/tensorrt_llm/utils/trtllm_utils.py +++ b/examples/tensorrt_llm/utils/trtllm_utils.py @@ -126,12 +126,14 @@ def cmd_line_args(): "--disaggregation-mode", type=str, default=DEFAULT_DISAGGREGATION_MODE, + choices=[mode.value for mode in DisaggregationMode], help=f"Mode to use for disaggregation. Default: {DEFAULT_DISAGGREGATION_MODE}", ) parser.add_argument( "--disaggregation-strategy", type=str, default=DEFAULT_DISAGGREGATION_STRATEGY, + choices=[strategy.value for strategy in DisaggregationStrategy], help=f"Strategy to use for disaggregation. Default: {DEFAULT_DISAGGREGATION_STRATEGY}", ) parser.add_argument( @@ -142,8 +144,23 @@ def cmd_line_args(): ) args = parser.parse_args() + config = Config() + # Set the model path and served model name. + config.model_path = args.model_path + if args.served_model_name: + config.served_model_name = args.served_model_name + else: + # This becomes an `Option` on the Rust side + config.served_model_name = None + + # Set the disaggregation mode and strategy. + config.disaggregation_mode = DisaggregationMode(args.disaggregation_mode) + config.disaggregation_strategy = DisaggregationStrategy( + args.disaggregation_strategy + ) + # Set the appropriate defaults for the endpoint and next endpoint. 
- if is_first_worker(args): + if is_first_worker(config): if args.endpoint == "": args.endpoint = DEFAULT_ENDPOINT if ( @@ -156,17 +173,7 @@ def cmd_line_args(): args.endpoint = DEFAULT_NEXT_ENDPOINT if args.next_endpoint != "": raise ValueError("Next endpoint is not allowed for the next worker") - endpoint = args.endpoint - - config = Config() - config.model_path = args.model_path - if args.served_model_name: - config.served_model_name = args.served_model_name - else: - # This becomes an `Option` on the Rust side - config.served_model_name = None - parsed_namespace, parsed_component_name, parsed_endpoint_name = parse_endpoint( endpoint ) @@ -174,12 +181,11 @@ def cmd_line_args(): config.namespace = parsed_namespace config.component = parsed_component_name config.endpoint = parsed_endpoint_name + config.next_endpoint = args.next_endpoint + config.tensor_parallel_size = args.tensor_parallel_size config.kv_block_size = args.kv_block_size config.extra_engine_args = args.extra_engine_args config.publish_events_and_metrics = args.publish_events_and_metrics - config.disaggregation_mode = args.disaggregation_mode - config.disaggregation_strategy = args.disaggregation_strategy - config.next_endpoint = args.next_endpoint return config From 3d9b7edba4380f57583281eeb2a0c354298250b1 Mon Sep 17 00:00:00 2001 From: tanmayv25 Date: Mon, 14 Jul 2025 15:56:18 -0700 Subject: [PATCH 15/22] Address coderabbit --- examples/tensorrt_llm/README.md | 2 +- examples/tensorrt_llm/components/worker.py | 4 ++- .../tensorrt_llm/engine_configs/decode.yaml | 1 - .../deepseek_r1/mtp/mtp_agg.yaml | 2 +- .../deepseek_r1/mtp/mtp_decode.yaml | 2 +- .../deepseek_r1/mtp/mtp_prefill.yaml | 2 +- .../deepseek_r1/simple/agg.yaml | 2 +- .../deepseek_r1/simple/decode.yaml | 2 +- .../deepseek_r1/simple/prefill.yaml | 2 +- .../deepseek_r1/wide_ep/dep16_agg.yaml | 2 +- .../deepseek_r1/wide_ep/eplb.yaml | 2 +- .../deepseek_r1/wide_ep/wide_ep_agg.yaml | 2 +- .../deepseek_r1/wide_ep/wide_ep_decode.yaml | 2 +- .../deepseek_r1/wide_ep/wide_ep_prefill.yaml | 2 +- examples/tensorrt_llm/launch/disagg.sh | 2 ++ examples/tensorrt_llm/launch/disagg_router.sh | 4 +-- .../multinode/start_trtllm_worker.sh | 4 +-- .../utils/request_handlers/handler_base.py | 2 ++ .../utils/request_handlers/handlers.py | 25 +++++-------------- examples/tensorrt_llm/utils/trtllm_utils.py | 2 +- 20 files changed, 30 insertions(+), 38 deletions(-) diff --git a/examples/tensorrt_llm/README.md b/examples/tensorrt_llm/README.md index 2c2b84d201..27973a90c3 100644 --- a/examples/tensorrt_llm/README.md +++ b/examples/tensorrt_llm/README.md @@ -117,7 +117,7 @@ This figure shows an overview of the major components to deploy: ### Single-Node Deployments > [!IMPORTANT] -> Below we provide some simple shell scripts that run the components for each configuration. Each shell script is simply running the `dynamo-run` to start up the ingress and using `python3` to start up the workers. You can easily take each commmand and run them in separate terminals. +> Below we provide some simple shell scripts that run the components for each configuration. Each shell script is simply running the `dynamo-run` to start up the ingress and using `python3` to start up the workers. You can easily take each command and run them in separate terminals. 
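As a concrete illustration, an aggregated launch boils down to two commands that can be run in separate terminals. The flags and the engine config path below are illustrative assumptions based on the launch scripts in this directory, not a verbatim copy of `launch/agg.sh`:

```bash
# Terminal 1: start the OpenAI-compatible HTTP ingress
dynamo run in=http out=dyn --http-port=8000

# Terminal 2: start an aggregated TensorRT-LLM worker
# (model and engine config are placeholders; substitute your own)
python3 components/worker.py \
  --model-path "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" \
  --served-model-name "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" \
  --extra-engine-args "engine_configs/agg.yaml"
```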
#### Aggregated ```bash diff --git a/examples/tensorrt_llm/components/worker.py b/examples/tensorrt_llm/components/worker.py index c2ac35d528..6a49fd0d85 100644 --- a/examples/tensorrt_llm/components/worker.py +++ b/examples/tensorrt_llm/components/worker.py @@ -8,7 +8,9 @@ import sys # Add the parent directory to the Python path so we can import utils -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) +parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) +if parent_dir not in sys.path: + sys.path.insert(0, parent_dir) import uvloop from tensorrt_llm import SamplingParams diff --git a/examples/tensorrt_llm/engine_configs/decode.yaml b/examples/tensorrt_llm/engine_configs/decode.yaml index eb943fd6e7..3460f6ff80 100644 --- a/examples/tensorrt_llm/engine_configs/decode.yaml +++ b/examples/tensorrt_llm/engine_configs/decode.yaml @@ -24,4 +24,3 @@ disable_overlap_scheduler: false use_cuda_graph: true kv_cache_config: free_gpu_memory_fraction: 0.95 - diff --git a/examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/mtp_agg.yaml b/examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/mtp_agg.yaml index 20944defed..f0b5411221 100644 --- a/examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/mtp_agg.yaml +++ b/examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/mtp_agg.yaml @@ -47,4 +47,4 @@ cuda_graph_batch_sizes: - 128 - 256 print_iter_log: true -kv_cache_dtype: fp8 \ No newline at end of file +kv_cache_dtype: fp8 diff --git a/examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/mtp_decode.yaml b/examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/mtp_decode.yaml index bfd4cde207..ab48b2e78b 100644 --- a/examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/mtp_decode.yaml +++ b/examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/mtp_decode.yaml @@ -50,4 +50,4 @@ cuda_graph_batch_sizes: - 128 - 256 print_iter_log: true -kv_cache_dtype: fp8 \ No newline at end of file +kv_cache_dtype: fp8 diff --git a/examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/mtp_prefill.yaml b/examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/mtp_prefill.yaml index 870a3f48d1..ee6ee26a94 100644 --- a/examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/mtp_prefill.yaml +++ b/examples/tensorrt_llm/engine_configs/deepseek_r1/mtp/mtp_prefill.yaml @@ -34,4 +34,4 @@ disable_overlap_scheduler: true # Enable the MTP(Multi-Token Prediction) in the prefill model engine speculative_config: decoding_type: MTP - num_nextn_predict_layers: 1 \ No newline at end of file + num_nextn_predict_layers: 1 diff --git a/examples/tensorrt_llm/engine_configs/deepseek_r1/simple/agg.yaml b/examples/tensorrt_llm/engine_configs/deepseek_r1/simple/agg.yaml index a7f9ce2ced..29dddba56f 100644 --- a/examples/tensorrt_llm/engine_configs/deepseek_r1/simple/agg.yaml +++ b/examples/tensorrt_llm/engine_configs/deepseek_r1/simple/agg.yaml @@ -51,4 +51,4 @@ cuda_graph_batch_sizes: - 128 - 256 print_iter_log: true -kv_cache_dtype: fp8 \ No newline at end of file +kv_cache_dtype: fp8 diff --git a/examples/tensorrt_llm/engine_configs/deepseek_r1/simple/decode.yaml b/examples/tensorrt_llm/engine_configs/deepseek_r1/simple/decode.yaml index 51a5bb2471..772b94b283 100644 --- a/examples/tensorrt_llm/engine_configs/deepseek_r1/simple/decode.yaml +++ b/examples/tensorrt_llm/engine_configs/deepseek_r1/simple/decode.yaml @@ -52,4 +52,4 @@ cuda_graph_batch_sizes: - 128 - 256 print_iter_log: true -kv_cache_dtype: fp8 \ No newline at end of file +kv_cache_dtype: fp8 diff --git 
a/examples/tensorrt_llm/engine_configs/deepseek_r1/simple/prefill.yaml b/examples/tensorrt_llm/engine_configs/deepseek_r1/simple/prefill.yaml index 0221dcbedf..6ae899a68a 100644 --- a/examples/tensorrt_llm/engine_configs/deepseek_r1/simple/prefill.yaml +++ b/examples/tensorrt_llm/engine_configs/deepseek_r1/simple/prefill.yaml @@ -34,4 +34,4 @@ kv_cache_config: disable_overlap_scheduler: true print_iter_log: true # NOTE: This dtype must match in both prefill/decode configs -kv_cache_dtype: fp8 \ No newline at end of file +kv_cache_dtype: fp8 diff --git a/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/dep16_agg.yaml b/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/dep16_agg.yaml index f02d3388b0..d697caacfa 100644 --- a/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/dep16_agg.yaml +++ b/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/dep16_agg.yaml @@ -24,4 +24,4 @@ cuda_graph_batch_sizes: - 64 - 128 - 256 -kv_cache_dtype: fp8 \ No newline at end of file +kv_cache_dtype: fp8 diff --git a/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/eplb.yaml b/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/eplb.yaml index 60e85e5fa9..f2fe0a13c6 100644 --- a/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/eplb.yaml +++ b/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/eplb.yaml @@ -4,4 +4,4 @@ # moe_load_balancer settings for TRTLLM based on: # https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/ep_load_balancer/README.md#online-ep-load-balancer num_slots: 288 -layer_updates_per_iter: 2 \ No newline at end of file +layer_updates_per_iter: 2 diff --git a/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml b/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml index d1ceb6b5ab..4f2df0aa56 100644 --- a/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml +++ b/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml @@ -32,4 +32,4 @@ cuda_graph_batch_sizes: - 64 - 128 - 256 -kv_cache_dtype: fp8 \ No newline at end of file +kv_cache_dtype: fp8 diff --git a/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml b/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml index ad3d5df0bf..a8d1854814 100644 --- a/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml +++ b/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml @@ -56,4 +56,4 @@ cuda_graph_batch_sizes: - 128 - 256 print_iter_log: true -kv_cache_dtype: fp8 \ No newline at end of file +kv_cache_dtype: fp8 diff --git a/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml b/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml index d750d3a2aa..44e439e506 100644 --- a/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml +++ b/examples/tensorrt_llm/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml @@ -38,4 +38,4 @@ kv_cache_config: disable_overlap_scheduler: true print_iter_log: true # NOTE: This dtype must match in both prefill/decode configs -kv_cache_dtype: fp8 \ No newline at end of file +kv_cache_dtype: fp8 diff --git a/examples/tensorrt_llm/launch/disagg.sh b/examples/tensorrt_llm/launch/disagg.sh index 74a1df3b70..a4c3c44167 100755 --- a/examples/tensorrt_llm/launch/disagg.sh +++ b/examples/tensorrt_llm/launch/disagg.sh @@ -32,6 +32,7 @@ CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 components/worker.py 
--model-path "$MODEL_PATH" \ --served-model-name "$SERVED_MODEL_NAME" \ --extra-engine-args "$PREFILL_ENGINE_ARGS" \ + --disaggregation-strategy "$DISAGGREGATION_STRATEGY" \ --disaggregation-mode prefill & PREFILL_PID=$! @@ -40,4 +41,5 @@ CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 components/worker.py \ --model-path "$MODEL_PATH" \ --served-model-name "$SERVED_MODEL_NAME" \ --extra-engine-args "$DECODE_ENGINE_ARGS" \ + --disaggregation-strategy "$DISAGGREGATION_STRATEGY" \ --disaggregation-mode decode \ No newline at end of file diff --git a/examples/tensorrt_llm/launch/disagg_router.sh b/examples/tensorrt_llm/launch/disagg_router.sh index 4fb20dff07..3557804c7b 100755 --- a/examples/tensorrt_llm/launch/disagg_router.sh +++ b/examples/tensorrt_llm/launch/disagg_router.sh @@ -44,7 +44,7 @@ CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 components/worker.py --extra-engine-args "$PREFILL_ENGINE_ARGS" \ --disaggregation-mode prefill \ --disaggregation-strategy "$DISAGGREGATION_STRATEGY" \ - $EXTRA_PREFILL_ARGS & + "$EXTRA_PREFILL_ARGS" & PREFILL_PID=$! # run decode worker @@ -54,4 +54,4 @@ CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 components/worker.py \ --extra-engine-args "$DECODE_ENGINE_ARGS" \ --disaggregation-mode decode \ --disaggregation-strategy "$DISAGGREGATION_STRATEGY" \ - $EXTRA_DECODE_ARGS \ No newline at end of file + "$EXTRA_DECODE_ARGS" \ No newline at end of file diff --git a/examples/tensorrt_llm/multinode/start_trtllm_worker.sh b/examples/tensorrt_llm/multinode/start_trtllm_worker.sh index ac7cd1b120..16737c3a6b 100755 --- a/examples/tensorrt_llm/multinode/start_trtllm_worker.sh +++ b/examples/tensorrt_llm/multinode/start_trtllm_worker.sh @@ -31,11 +31,11 @@ fi EXTRA_ARGS="" if [[ -n ${DISAGGREGATION_MODE} ]]; then - EXTRA_ARGS+="--disaggregation-mode ${DISAGGREGATION_MODE}" + EXTRA_ARGS+="--disaggregation-mode ${DISAGGREGATION_MODE} " fi if [[ -n ${DISAGGREGATION_STRATEGY} ]]; then - EXTRA_ARGS+="--disaggregation-strategy ${DISAGGREGATION_STRATEGY}" + EXTRA_ARGS+="--disaggregation-strategy ${DISAGGREGATION_STRATEGY} " fi trtllm-llmapi-launch \ diff --git a/examples/tensorrt_llm/utils/request_handlers/handler_base.py b/examples/tensorrt_llm/utils/request_handlers/handler_base.py index 9dca9823cf..3990f8ad1c 100644 --- a/examples/tensorrt_llm/utils/request_handlers/handler_base.py +++ b/examples/tensorrt_llm/utils/request_handlers/handler_base.py @@ -103,6 +103,8 @@ async def generate_locally(self, request: dict): disaggregated_params = LlmDisaggregatedParams(request_type="context_only") if "disaggregated_params" in request: + if self.disaggregation_mode == DisaggregationMode.PREFILL: + raise ValueError("Cannot provide disaggregated_params in prefill mode") disaggregated_params = DisaggregatedParamsCodec.decode( DisaggregatedParams(**request["disaggregated_params"]) ) diff --git a/examples/tensorrt_llm/utils/request_handlers/handlers.py b/examples/tensorrt_llm/utils/request_handlers/handlers.py index 96ff37f781..72fdc55c57 100644 --- a/examples/tensorrt_llm/utils/request_handlers/handlers.py +++ b/examples/tensorrt_llm/utils/request_handlers/handlers.py @@ -8,24 +8,10 @@ DisaggregationMode, DisaggregationStrategy, HandlerBase, + RequestHandlerConfig, ) -@dataclass -class RequestHandlerConfig: - """ - Configuration for the request handler - """ - - component: object - engine: object - default_sampling_params: object - publisher: object - disaggregation_mode: DisaggregationMode - disaggregation_strategy: DisaggregationStrategy - 
next_client: object - - class RequestHandlerFactory: def __init__(self): self.handlers = { @@ -144,14 +130,15 @@ async def generate(self, request: dict): response_count += 1 if response_count > 1: raise ValueError("Prefill response should be generated only once.") - + + response_data = prefill_response.data() if prefill_response is not None and self.check_error( - prefill_response.data() + response_data ): - yield prefill_response.data() + yield response_data return if prefill_response is not None: - request["disaggregated_params"] = prefill_response.data()[ + request["disaggregated_params"] = response_data[ "disaggregated_params" ] diff --git a/examples/tensorrt_llm/utils/trtllm_utils.py b/examples/tensorrt_llm/utils/trtllm_utils.py index b429a20539..773e2be50c 100644 --- a/examples/tensorrt_llm/utils/trtllm_utils.py +++ b/examples/tensorrt_llm/utils/trtllm_utils.py @@ -165,7 +165,7 @@ def cmd_line_args(): args.endpoint = DEFAULT_ENDPOINT if ( args.next_endpoint == "" - and args.disaggregation_mode != "prefill_and_decode" + and config.disaggregation_mode != DisaggregationMode.AGGREGATED ): args.next_endpoint = DEFAULT_NEXT_ENDPOINT else: From 74aacbcaa04b86feff43fef2afa6edd3751a9613 Mon Sep 17 00:00:00 2001 From: tanmayv25 Date: Mon, 14 Jul 2025 16:03:21 -0700 Subject: [PATCH 16/22] Fixes --- examples/tensorrt_llm/launch/disagg_router.sh | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/examples/tensorrt_llm/launch/disagg_router.sh b/examples/tensorrt_llm/launch/disagg_router.sh index 3557804c7b..8383846889 100755 --- a/examples/tensorrt_llm/launch/disagg_router.sh +++ b/examples/tensorrt_llm/launch/disagg_router.sh @@ -28,30 +28,29 @@ dynamo run in=http out=dyn --router-mode kv --http-port=8000 & DYNAMO_PID=$! -EXTRA_PREFILL_ARGS="" -EXTRA_DECODE_ARGS="" +EXTRA_PREFILL_ARGS=() +EXTRA_DECODE_ARGS=() if [ "$DISAGGREGATION_STRATEGY" == "prefill_first" ]; then - EXTRA_PREFILL_ARGS="--publish-events-and-metrics" + EXTRA_PREFILL_ARGS+=(--publish-events-and-metrics) else - EXTRA_DECODE_ARGS="--publish-events-and-metrics" + EXTRA_DECODE_ARGS+=(--publish-events-and-metrics) fi - # run prefill worker CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 components/worker.py \ --model-path "$MODEL_PATH" \ --served-model-name "$SERVED_MODEL_NAME" \ - --extra-engine-args "$PREFILL_ENGINE_ARGS" \ + --extra-engine-args "$PREFILL_ENGINE_ARGS" \ --disaggregation-mode prefill \ --disaggregation-strategy "$DISAGGREGATION_STRATEGY" \ - "$EXTRA_PREFILL_ARGS" & + "${EXTRA_PREFILL_ARGS[@]}" & PREFILL_PID=$! 
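Switching `EXTRA_PREFILL_ARGS`/`EXTRA_DECODE_ARGS` from plain strings to bash arrays avoids passing a spurious empty `""` argument when no extra flag is set and keeps multi-word options intact. A minimal standalone sketch of the pattern (the `PUBLISH_METRICS` variable is made up purely for illustration):

```bash
#!/usr/bin/env bash
# Optional flags accumulate in an array; an empty array expands to nothing.
EXTRA_ARGS=()
if [ "${PUBLISH_METRICS:-0}" = "1" ]; then
    EXTRA_ARGS+=(--publish-events-and-metrics)
fi
# "${EXTRA_ARGS[@]}" yields zero or more separate words, never a lone "" argument,
# which is what a quoted-but-empty string variable would produce.
python3 components/worker.py "${EXTRA_ARGS[@]}"
```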
# run decode worker CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 components/worker.py \ --model-path "$MODEL_PATH" \ --served-model-name "$SERVED_MODEL_NAME" \ - --extra-engine-args "$DECODE_ENGINE_ARGS" \ + --extra-engine-args "$DECODE_ENGINE_ARGS" \ --disaggregation-mode decode \ --disaggregation-strategy "$DISAGGREGATION_STRATEGY" \ - "$EXTRA_DECODE_ARGS" \ No newline at end of file + "${EXTRA_DECODE_ARGS[@]}" \ No newline at end of file From 8221214c4a29ec707aa403c401fca29aebe00d07 Mon Sep 17 00:00:00 2001 From: tanmayv25 Date: Mon, 14 Jul 2025 16:31:58 -0700 Subject: [PATCH 17/22] Format fix --- examples/tensorrt_llm/components/worker.py | 46 ++++++++++++++++--- .../utils/request_handlers/handlers.py | 11 ++--- 2 files changed, 42 insertions(+), 15 deletions(-) diff --git a/examples/tensorrt_llm/components/worker.py b/examples/tensorrt_llm/components/worker.py index 6a49fd0d85..81f1832121 100644 --- a/examples/tensorrt_llm/components/worker.py +++ b/examples/tensorrt_llm/components/worker.py @@ -7,17 +7,10 @@ import signal import sys -# Add the parent directory to the Python path so we can import utils -parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) -if parent_dir not in sys.path: - sys.path.insert(0, parent_dir) - import uvloop from tensorrt_llm import SamplingParams from tensorrt_llm.llmapi.llm_utils import update_llm_args_with_extra_options from tensorrt_llm.llmapi.tokenizer import tokenizer_factory -from utils.request_handlers.handlers import RequestHandlerConfig, RequestHandlerFactory -from utils.trtllm_utils import Config, cmd_line_args, is_first_worker, parse_endpoint from dynamo.llm import ( ModelType, @@ -28,6 +21,45 @@ from dynamo.runtime import DistributedRuntime, dynamo_worker from dynamo.runtime.logging import configure_dynamo_logging + +def _setup_path_and_imports(): + """Setup path and import utils modules""" + # Add the parent directory to the Python path so we can import utils + parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) + if parent_dir not in sys.path: + sys.path.insert(0, parent_dir) + + from utils.request_handlers.handlers import ( + RequestHandlerConfig, + RequestHandlerFactory, + ) + from utils.trtllm_utils import ( + Config, + cmd_line_args, + is_first_worker, + parse_endpoint, + ) + + return ( + RequestHandlerConfig, + RequestHandlerFactory, + Config, + cmd_line_args, + is_first_worker, + parse_endpoint, + ) + + +# Import utils modules +( + RequestHandlerConfig, + RequestHandlerFactory, + Config, + cmd_line_args, + is_first_worker, + parse_endpoint, +) = _setup_path_and_imports() + # Default buffer size for kv cache events. 
DEFAULT_KV_EVENT_BUFFER_MAX_SIZE = 1024 diff --git a/examples/tensorrt_llm/utils/request_handlers/handlers.py b/examples/tensorrt_llm/utils/request_handlers/handlers.py index 72fdc55c57..0c546840a6 100644 --- a/examples/tensorrt_llm/utils/request_handlers/handlers.py +++ b/examples/tensorrt_llm/utils/request_handlers/handlers.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 import copy -from dataclasses import dataclass from utils.request_handlers.handler_base import ( DisaggregationMode, @@ -130,17 +129,13 @@ async def generate(self, request: dict): response_count += 1 if response_count > 1: raise ValueError("Prefill response should be generated only once.") - + response_data = prefill_response.data() - if prefill_response is not None and self.check_error( - response_data - ): + if prefill_response is not None and self.check_error(response_data): yield response_data return if prefill_response is not None: - request["disaggregated_params"] = response_data[ - "disaggregated_params" - ] + request["disaggregated_params"] = response_data["disaggregated_params"] async for res in self.generate_locally(request): yield res From 9377a2d88660ebfed033df1f56760ae577fc5cdd Mon Sep 17 00:00:00 2001 From: tanmayv25 Date: Mon, 14 Jul 2025 17:32:30 -0700 Subject: [PATCH 18/22] Update llama4 instructions --- examples/tensorrt_llm/README.md | 52 ++------------ .../llama4/eagle/eagle_agg.yaml | 51 +++++++++++++ .../llama4/eagle/eagle_decode.yaml | 50 +++++++++++++ .../llama4/eagle/eagle_prefill.yaml | 36 ++++++++++ examples/tensorrt_llm/llama4_plus_eagle.md | 72 +++++++++++++++++++ 5 files changed, 214 insertions(+), 47 deletions(-) create mode 100644 examples/tensorrt_llm/engine_configs/llama4/eagle/eagle_agg.yaml create mode 100644 examples/tensorrt_llm/engine_configs/llama4/eagle/eagle_decode.yaml create mode 100644 examples/tensorrt_llm/engine_configs/llama4/eagle/eagle_prefill.yaml create mode 100644 examples/tensorrt_llm/llama4_plus_eagle.md diff --git a/examples/tensorrt_llm/README.md b/examples/tensorrt_llm/README.md index 43ae156719..f32e2d3565 100644 --- a/examples/tensorrt_llm/README.md +++ b/examples/tensorrt_llm/README.md @@ -33,6 +33,8 @@ This directory contains examples and reference implementations for deploying Lar - [Benchmarking](#benchmarking) - [Disaggregation Strategy](#disaggregation-strategy) - [KV Cache Transfer](#kv-cache-transfer-in-disaggregated-serving) +- [More Example Architectures](#more-example-architectures) + - [Llama 4 Maverick Instruct + Eagle Speculative Decoding](./llama4_plus_eagle.md) # Quick Start @@ -172,7 +174,7 @@ Notes: ### Multinode Deployment -For details and instructions on multinode serving, please refer to the [multinode-examples.md](./multinode/multinode-examples.md) document. This guide provides step-by-step examples and configuration tips for deploying Dynamo with TensorRT-LLM across multiple nodes. +For comprehensive instructions on multinode serving, see the [multinode-examples.md](./multinode/multinode-examples.md) guide. It provides step-by-step deployment examples and configuration tips for running Dynamo with TensorRT-LLM across multiple nodes. While the walkthrough uses DeepSeek-R1 as the model, you can easily adapt the process for any supported model by updating the relevant configuration files. See the [Llama4+eagle](./llama4_plus_eagle.md) guide for how to use these scripts when a single worker fits on a single node.
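In practice, adapting the multinode scripts to another model mostly means exporting a different model identifier and engine config before invoking the srun helpers. The values below are placeholders, following the pattern used in the Llama4+eagle guide:

```bash
cd $DYNAMO_ROOT/examples/tensorrt_llm
export MODEL_PATH="<hf-model-id-or-local-checkpoint-path>"
export SERVED_MODEL_NAME="<client-facing-model-name>"
export ENGINE_CONFIG="/mnt/engine_configs/<your-engine-config>.yaml"
export NUM_NODES=1
./multinode/srun_aggregated.sh
```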
### Client @@ -203,50 +205,6 @@ DISAGGREGATION_STRATEGY="prefill_first" ./launch/disagg.sh Dynamo with TensorRT-LLM supports two methods for transferring KV cache in disaggregated serving: UCX (default) and NIXL (experimental). For detailed information and configuration instructions for each method, see the [KV cache transfer guide](./kv-cache-tranfer.md). -### Example architectures for Llama 4 Maverick Instruct + Eagle Speculative Decoding +## More Example Architectures -#### Notes -* Testing for the current example used: - * One GB200x4 node for aggregate serving - * Two GB200x4 nodes for disaggregate serving -* To run Eagle Speculative Decoding with Llama 4, ensure the container meets the following criteria: - * Built with a version of TensorRT-LLM based on the 0.21 release [Link](https://github.com/NVIDIA/TensorRT-LLM/tree/release/0.21) - * The TensorRT-LLM build includes the changes from this PR [Link](https://github.com/NVIDIA/TensorRT-LLM/pull/5975) -* If you need to download model weights off huggingface, make sure you run the command `huggingface-cli login` and have access to the necessary gated models. - -##### Aggregated Serving -```bash -cd /workspace/examples/tensorrt_llm -dynamo serve graphs.disagg:Frontend -f configs/llama4/eagle/eagle_agg.yaml -``` -* Known Issue: In Aggregated Serving, setting `max_num_tokens` to higher values (e.g. `max_num_tokens: 8448`) can lead to Out of Memory (OOM) errors. This is being investigated by the TRTLLM team. - -##### Disaggregated Serving - -###### Head Node -Start nats/etcd -``` bash -nats-server -js & -etcd --listen-client-urls http://0.0.0.0:2379 --advertise-client-urls http://0.0.0.0:2379 --data-dir /tmp/etcd & -``` - -Launch graph of Frontend and TensorRTLLMWorker (decode) on head node: - -```bash -cd /workspace/examples/tensorrt_llm -dynamo serve graphs.agg:Frontend -f configs/llama4/eagle/eagle_disagg.yaml & -``` - -###### Worker Node(s) -Set environment variables pointing at the etcd/nats endpoints on the head node. -```bash -export HEAD_NODE_IP="" -export NATS_SERVER="nats://${HEAD_NODE_IP}:4222" -export ETCD_ENDPOINTS="${HEAD_NODE_IP}:2379" -``` - -Deploy a Prefill worker: -```bash -cd /workspace/examples/tensorrt_llm -dynamo serve components.prefill_worker:TensorRTLLMPrefillWorker -f configs/llama4/eagle/eagle_disagg.yaml --service-name TensorRTLLMPrefillWorker & -``` +- [Llama 4 Maverick Instruct + Eagle Speculative Decoding](./llama4_plus_eagle.md) diff --git a/examples/tensorrt_llm/engine_configs/llama4/eagle/eagle_agg.yaml b/examples/tensorrt_llm/engine_configs/llama4/eagle/eagle_agg.yaml new file mode 100644 index 0000000000..1bed25ef27 --- /dev/null +++ b/examples/tensorrt_llm/engine_configs/llama4/eagle/eagle_agg.yaml @@ -0,0 +1,51 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +backend: pytorch +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +max_batch_size: 256 +# When max_num_tokens set to higher values, can cause OOM issues. +# Will be investigated in the future with TRTLLM team. +max_num_tokens: 1024 +max_seq_len: 8448 +autotuner_enabled: false +disable_overlap_scheduler: true + +# Enable Speculative Decoding in the model engine +speculative_config: + decoding_type: Eagle + max_draft_len: 1 + pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3 + eagle3_one_model: False + +kv_cache_config: + free_gpu_memory_fraction: 0.5 + enable_block_reuse: false + +use_cuda_graph: true +cuda_graph_padding_enabled: true +cuda_graph_batch_sizes: +- 1 +- 2 +- 4 +- 8 +- 16 +- 32 +- 64 +- 128 +- 256 +print_iter_log: true +kv_cache_dtype: fp8 diff --git a/examples/tensorrt_llm/engine_configs/llama4/eagle/eagle_decode.yaml b/examples/tensorrt_llm/engine_configs/llama4/eagle/eagle_decode.yaml new file mode 100644 index 0000000000..4b595d2126 --- /dev/null +++ b/examples/tensorrt_llm/engine_configs/llama4/eagle/eagle_decode.yaml @@ -0,0 +1,50 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +backend: pytorch +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +max_batch_size: 256 +max_num_tokens: 512 +# 8704 = 8192 ISL + 512 OSL +max_seq_len: 8704 +disable_overlap_scheduler: true +autotuner_enabled: false + +# Enable Speculative Decoding in the model engine +speculative_config: + decoding_type: Eagle + max_draft_len: 1 + pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3 + eagle3_one_model: False + +kv_cache_config: + free_gpu_memory_fraction: 0.5 + enable_block_reuse: false + +use_cuda_graph: true +cuda_graph_padding_enabled: true +cuda_graph_batch_sizes: +- 1 +- 2 +- 4 +- 8 +- 16 +- 32 +- 64 +- 128 +- 256 +print_iter_log: true +kv_cache_dtype: fp8 diff --git a/examples/tensorrt_llm/engine_configs/llama4/eagle/eagle_prefill.yaml b/examples/tensorrt_llm/engine_configs/llama4/eagle/eagle_prefill.yaml new file mode 100644 index 0000000000..8442e478ba --- /dev/null +++ b/examples/tensorrt_llm/engine_configs/llama4/eagle/eagle_prefill.yaml @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +backend: pytorch +tensor_parallel_size: 4 +moe_expert_parallel_size: 4 +max_batch_size: 1 +max_num_tokens: 8192 +max_seq_len: 8192 +print_iter_log: true +kv_cache_dtype: fp8 +disable_overlap_scheduler: true +autotuner_enabled: false + +# Enable Speculative Decoding in the model engine +speculative_config: + decoding_type: Eagle + max_draft_len: 1 + pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3 + eagle3_one_model: False + +kv_cache_config: + free_gpu_memory_fraction: 0.5 + enable_block_reuse: false diff --git a/examples/tensorrt_llm/llama4_plus_eagle.md b/examples/tensorrt_llm/llama4_plus_eagle.md new file mode 100644 index 0000000000..00c5aa2034 --- /dev/null +++ b/examples/tensorrt_llm/llama4_plus_eagle.md @@ -0,0 +1,72 @@ + + +# Llama 4 Maverick Instruct with Eagle Speculative Decoding on SLURM + +This guide demonstrates how to deploy Llama 4 Maverick Instruct with Eagle Speculative Decoding on GB200x4 nodes. We will be following the [multi-node deployment instructions](./multinode/multinode-examples.md) to set up the environment for the following scenarios: + +- **Aggregated Serving:** + Deploy the entire Llama 4 model on a single GB200x4 node for end-to-end serving. + +- **Disaggregated Serving:** + Distribute the workload across two GB200x4 nodes: + - One node runs the decode worker. + - The other node runs the prefill worker. + +For advanced control over how requests are routed between prefill and decode workers in disaggregated mode, refer to the [Disaggregation Strategy](./README.md#disaggregation-strategy) section. + +## Notes +* To run Eagle Speculative Decoding with Llama 4, ensure the container meets the following criteria: + * Built with a version of TensorRT-LLM based on the 0.21 release [Link](https://github.com/NVIDIA/TensorRT-LLM/tree/release/0.21) + * The TensorRT-LLM build includes the changes from this PR [Link](https://github.com/NVIDIA/TensorRT-LLM/pull/5975) +* If you need to download model weights off huggingface, make sure you run the command `huggingface-cli login` and have access to the necessary gated models. + + +## Setup + +Assuming you have already allocated your nodes via `salloc`, and are +inside an interactive shell on one of the allocated nodes, set the +following environment variables based: + +```bash +export IMAGE="" +export MOUNTS="${PWD}/:/mnt" +export MODEL_PATH="nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8" +export SERVED_MODEL_NAME="nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8" +``` + + +## Aggregated Serving +```bash +cd $DYNAMO_ROOT/examples/tensorrt_llm +export NUM_NODES=1 +export ENGINE_CONFIG="/mnt/engine_configs/llama4/eagle/eagle_agg.yaml" +./multinode/srun_aggregated.sh +``` +* Known Issue: In Aggregated Serving, setting `max_num_tokens` to higher values (e.g. `max_num_tokens: 8448`) can lead to Out of Memory (OOM) errors. This is being investigated by the TRTLLM team. + +## Disaggregated Serving + +```bash +cd $DYNAMO_ROOT/examples/tensorrt_llm +export NUM_PREFILL_NODES=1 +export PREFILL_ENGINE_CONFIG="/mnt/engine_configs/llama4/eagle/eagle_prefill.yaml" +export NUM_DECODE_NODES=1 +export DECODE_ENGINE_CONFIG="/mnt/engine_configs/llama4/eagle/eagle_decode.yaml" +./multinode/srun_disaggregated.sh +``` +* Known Issue: In Aggregated Serving, setting `max_num_tokens` to higher values (e.g. `max_num_tokens: 8448`) can lead to Out of Memory (OOM) errors. This is being investigated by the TRTLLM team. 
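Once either deployment is up, a quick sanity check is to send a request to the frontend node. The endpoint path and payload below assume the default OpenAI-compatible HTTP ingress on port 8000 used throughout these examples:

```bash
curl -s http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8",
        "messages": [{"role": "user", "content": "Say hello in one sentence."}],
        "max_tokens": 32,
        "stream": false
      }'
```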
From 49f6226e377d0c1d936052fa83453de22f4dbe95 Mon Sep 17 00:00:00 2001 From: tanmayv25 Date: Mon, 14 Jul 2025 17:44:37 -0700 Subject: [PATCH 19/22] remove duplicate configs --- .../configs/llama4/eagle/eagle_agg.yaml | 31 ----------- .../configs/llama4/eagle/eagle_disagg.yaml | 44 ---------------- .../eagle/engine_configs/agg_config.yaml | 51 ------------------- .../eagle/engine_configs/decode_config.yaml | 50 ------------------ .../eagle/engine_configs/prefill_config.yaml | 36 ------------- examples/tensorrt_llm/llama4_plus_eagle.md | 8 ++- 6 files changed, 7 insertions(+), 213 deletions(-) delete mode 100644 examples/tensorrt_llm/configs/llama4/eagle/eagle_agg.yaml delete mode 100644 examples/tensorrt_llm/configs/llama4/eagle/eagle_disagg.yaml delete mode 100644 examples/tensorrt_llm/configs/llama4/eagle/engine_configs/agg_config.yaml delete mode 100644 examples/tensorrt_llm/configs/llama4/eagle/engine_configs/decode_config.yaml delete mode 100644 examples/tensorrt_llm/configs/llama4/eagle/engine_configs/prefill_config.yaml diff --git a/examples/tensorrt_llm/configs/llama4/eagle/eagle_agg.yaml b/examples/tensorrt_llm/configs/llama4/eagle/eagle_agg.yaml deleted file mode 100644 index fe4a94df4b..0000000000 --- a/examples/tensorrt_llm/configs/llama4/eagle/eagle_agg.yaml +++ /dev/null @@ -1,31 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -Frontend: - # This is the client-facing model name, you can set this to anything you'd like. - served_model_name: "nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8" - endpoint: dynamo.TensorRTLLMWorker.generate - port: 8000 - router: round-robin - -TensorRTLLMWorker: - served_model_name: "nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8" - model-path: "nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8" - extra-engine-args: "configs/llama4/eagle/engine_configs/agg_config.yaml" - router: round-robin - ServiceArgs: - workers: 1 - resources: - gpu: 4 diff --git a/examples/tensorrt_llm/configs/llama4/eagle/eagle_disagg.yaml b/examples/tensorrt_llm/configs/llama4/eagle/eagle_disagg.yaml deleted file mode 100644 index 3bfe111fac..0000000000 --- a/examples/tensorrt_llm/configs/llama4/eagle/eagle_disagg.yaml +++ /dev/null @@ -1,44 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -Frontend: - served_model_name: "nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8" - endpoint: dynamo.TensorRTLLMWorker.generate - port: 8000 - router: round-robin - -TensorRTLLMWorker: - served_model_name: "nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8" - model-path: "nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8" - # Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine. - # The fields in `extra-engine-args` holds higher priority than the above TRTLLM engine fields. - extra-engine-args: "configs/llama4/eagle/engine_configs/decode_config.yaml" - router: round-robin - enable-disagg: true - ServiceArgs: - workers: 1 - resources: - gpu: 4 - -TensorRTLLMPrefillWorker: - model-path: "nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8" - # Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine. - # The fields in `extra-engine-args` holds higher priority than the above TRTLLM engine fields. - extra-engine-args: "configs/llama4/eagle/engine_configs/prefill_config.yaml" - router: round-robin - ServiceArgs: - workers: 1 - resources: - gpu: 4 diff --git a/examples/tensorrt_llm/configs/llama4/eagle/engine_configs/agg_config.yaml b/examples/tensorrt_llm/configs/llama4/eagle/engine_configs/agg_config.yaml deleted file mode 100644 index 1bed25ef27..0000000000 --- a/examples/tensorrt_llm/configs/llama4/eagle/engine_configs/agg_config.yaml +++ /dev/null @@ -1,51 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -backend: pytorch -tensor_parallel_size: 4 -moe_expert_parallel_size: 4 -max_batch_size: 256 -# When max_num_tokens set to higher values, can cause OOM issues. -# Will be investigated in the future with TRTLLM team. -max_num_tokens: 1024 -max_seq_len: 8448 -autotuner_enabled: false -disable_overlap_scheduler: true - -# Enable Speculative Decoding in the model engine -speculative_config: - decoding_type: Eagle - max_draft_len: 1 - pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3 - eagle3_one_model: False - -kv_cache_config: - free_gpu_memory_fraction: 0.5 - enable_block_reuse: false - -use_cuda_graph: true -cuda_graph_padding_enabled: true -cuda_graph_batch_sizes: -- 1 -- 2 -- 4 -- 8 -- 16 -- 32 -- 64 -- 128 -- 256 -print_iter_log: true -kv_cache_dtype: fp8 diff --git a/examples/tensorrt_llm/configs/llama4/eagle/engine_configs/decode_config.yaml b/examples/tensorrt_llm/configs/llama4/eagle/engine_configs/decode_config.yaml deleted file mode 100644 index 4b595d2126..0000000000 --- a/examples/tensorrt_llm/configs/llama4/eagle/engine_configs/decode_config.yaml +++ /dev/null @@ -1,50 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -backend: pytorch -tensor_parallel_size: 4 -moe_expert_parallel_size: 4 -max_batch_size: 256 -max_num_tokens: 512 -# 8704 = 8192 ISL + 512 OSL -max_seq_len: 8704 -disable_overlap_scheduler: true -autotuner_enabled: false - -# Enable Speculative Decoding in the model engine -speculative_config: - decoding_type: Eagle - max_draft_len: 1 - pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3 - eagle3_one_model: False - -kv_cache_config: - free_gpu_memory_fraction: 0.5 - enable_block_reuse: false - -use_cuda_graph: true -cuda_graph_padding_enabled: true -cuda_graph_batch_sizes: -- 1 -- 2 -- 4 -- 8 -- 16 -- 32 -- 64 -- 128 -- 256 -print_iter_log: true -kv_cache_dtype: fp8 diff --git a/examples/tensorrt_llm/configs/llama4/eagle/engine_configs/prefill_config.yaml b/examples/tensorrt_llm/configs/llama4/eagle/engine_configs/prefill_config.yaml deleted file mode 100644 index 8442e478ba..0000000000 --- a/examples/tensorrt_llm/configs/llama4/eagle/engine_configs/prefill_config.yaml +++ /dev/null @@ -1,36 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -backend: pytorch -tensor_parallel_size: 4 -moe_expert_parallel_size: 4 -max_batch_size: 1 -max_num_tokens: 8192 -max_seq_len: 8192 -print_iter_log: true -kv_cache_dtype: fp8 -disable_overlap_scheduler: true -autotuner_enabled: false - -# Enable Speculative Decoding in the model engine -speculative_config: - decoding_type: Eagle - max_draft_len: 1 - pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3 - eagle3_one_model: False - -kv_cache_config: - free_gpu_memory_fraction: 0.5 - enable_block_reuse: false diff --git a/examples/tensorrt_llm/llama4_plus_eagle.md b/examples/tensorrt_llm/llama4_plus_eagle.md index 00c5aa2034..fbbe23d085 100644 --- a/examples/tensorrt_llm/llama4_plus_eagle.md +++ b/examples/tensorrt_llm/llama4_plus_eagle.md @@ -43,6 +43,8 @@ inside an interactive shell on one of the allocated nodes, set the following environment variables based: ```bash +cd $DYNAMO_ROOT/examples/tensorrt_llm + export IMAGE="" export MOUNTS="${PWD}/:/mnt" export MODEL_PATH="nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8" @@ -52,7 +54,6 @@ export SERVED_MODEL_NAME="nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8" ## Aggregated Serving ```bash -cd $DYNAMO_ROOT/examples/tensorrt_llm export NUM_NODES=1 export ENGINE_CONFIG="/mnt/engine_configs/llama4/eagle/eagle_agg.yaml" ./multinode/srun_aggregated.sh @@ -70,3 +71,8 @@ export DECODE_ENGINE_CONFIG="/mnt/engine_configs/llama4/eagle/eagle_decode.yaml" ./multinode/srun_disaggregated.sh ``` * Known Issue: In Aggregated Serving, setting `max_num_tokens` to higher values (e.g. `max_num_tokens: 8448`) can lead to Out of Memory (OOM) errors. This is being investigated by the TRTLLM team. + + +## Example Request + +See [here](./multinode/multinode-examples.md#example-request) to learn how to send a request to the deployment. From 405a4c8de30ea7ba3b426bde00546b79c0df9afb Mon Sep 17 00:00:00 2001 From: tanmayv25 Date: Mon, 14 Jul 2025 17:54:10 -0700 Subject: [PATCH 20/22] Fix documentation --- examples/tensorrt_llm/llama4_plus_eagle.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/tensorrt_llm/llama4_plus_eagle.md b/examples/tensorrt_llm/llama4_plus_eagle.md index fbbe23d085..0200dfce19 100644 --- a/examples/tensorrt_llm/llama4_plus_eagle.md +++ b/examples/tensorrt_llm/llama4_plus_eagle.md @@ -19,10 +19,10 @@ limitations under the License. This guide demonstrates how to deploy Llama 4 Maverick Instruct with Eagle Speculative Decoding on GB200x4 nodes. We will be following the [multi-node deployment instructions](./multinode/multinode-examples.md) to set up the environment for the following scenarios: -- **Aggregated Serving:** +- **Aggregated Serving:** Deploy the entire Llama 4 model on a single GB200x4 node for end-to-end serving. -- **Disaggregated Serving:** +- **Disaggregated Serving:** Distribute the workload across two GB200x4 nodes: - One node runs the decode worker. - The other node runs the prefill worker. @@ -46,11 +46,14 @@ following environment variables based: cd $DYNAMO_ROOT/examples/tensorrt_llm export IMAGE="" +# export MOUNTS="${PWD}/:/mnt,/lustre:/lustre" export MOUNTS="${PWD}/:/mnt" export MODEL_PATH="nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8" export SERVED_MODEL_NAME="nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8" ``` +See [this](./multinode/multinode-examples.md#setup) section from multinode guide to learn more about the above options. 
+ ## Aggregated Serving ```bash @@ -63,7 +66,6 @@ export ENGINE_CONFIG="/mnt/engine_configs/llama4/eagle/eagle_agg.yaml" ## Disaggregated Serving ```bash -cd $DYNAMO_ROOT/examples/tensorrt_llm export NUM_PREFILL_NODES=1 export PREFILL_ENGINE_CONFIG="/mnt/engine_configs/llama4/eagle/eagle_prefill.yaml" export NUM_DECODE_NODES=1 From ea0e8a5f2d23173e76470b5025d332d976b6d53c Mon Sep 17 00:00:00 2001 From: tanmayv25 Date: Mon, 14 Jul 2025 18:00:04 -0700 Subject: [PATCH 21/22] Fix --- examples/tensorrt_llm/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tensorrt_llm/README.md b/examples/tensorrt_llm/README.md index f32e2d3565..016b37de29 100644 --- a/examples/tensorrt_llm/README.md +++ b/examples/tensorrt_llm/README.md @@ -174,7 +174,7 @@ Notes: ### Multinode Deployment -For comprehensive instructions on multinode serving, see the [multinode-examples.md](./multinode/multinode-examples.md) guide. It provides step-by-step deployment examples and configuration tips for running Dynamo with TensorRT-LLM across multiple nodes. While the walkthrough uses DeepSeek-R1 as the model, you can easily adapt the process for any supported model by updating the relevant configuration files. You can see [Llama4+eagle](./llama4_plus_eagle.md) guide to see how to use these scripts when a single worker fits on the single node. +For comprehensive instructions on multinode serving, see the [multinode-examples.md](./multinode/multinode-examples.md) guide. It provides step-by-step deployment examples and configuration tips for running Dynamo with TensorRT-LLM across multiple nodes. While the walkthrough uses DeepSeek-R1 as the model, you can easily adapt the process for any supported model by updating the relevant configuration files. You can see [Llama4+eagle](./llama4_plus_eagle.md) guide to learn how to use these scripts when a single worker fits on the single node. 
### Client From da4a132349d637936318438f498115b47981cd2e Mon Sep 17 00:00:00 2001 From: tanmayv25 Date: Mon, 14 Jul 2025 19:37:48 -0700 Subject: [PATCH 22/22] mypy fixes --- examples/tensorrt_llm/components/worker.py | 4 +++ .../utils/request_handlers/handlers.py | 6 +++-- examples/tensorrt_llm/utils/trtllm_utils.py | 27 ++++++++++--------- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/examples/tensorrt_llm/components/worker.py b/examples/tensorrt_llm/components/worker.py index 81f1832121..3c8015c4fb 100644 --- a/examples/tensorrt_llm/components/worker.py +++ b/examples/tensorrt_llm/components/worker.py @@ -6,6 +6,7 @@ import os import signal import sys +from typing import TYPE_CHECKING import uvloop from tensorrt_llm import SamplingParams @@ -21,6 +22,9 @@ from dynamo.runtime import DistributedRuntime, dynamo_worker from dynamo.runtime.logging import configure_dynamo_logging +if TYPE_CHECKING: + from utils.trtllm_utils import Config + def _setup_path_and_imports(): """Setup path and import utils modules""" diff --git a/examples/tensorrt_llm/utils/request_handlers/handlers.py b/examples/tensorrt_llm/utils/request_handlers/handlers.py index 0c546840a6..1a47112ba4 100644 --- a/examples/tensorrt_llm/utils/request_handlers/handlers.py +++ b/examples/tensorrt_llm/utils/request_handlers/handlers.py @@ -130,11 +130,13 @@ async def generate(self, request: dict): if response_count > 1: raise ValueError("Prefill response should be generated only once.") - response_data = prefill_response.data() + response_data = ( + prefill_response.data() if prefill_response is not None else None + ) if prefill_response is not None and self.check_error(response_data): yield response_data return - if prefill_response is not None: + if prefill_response is not None and response_data is not None: request["disaggregated_params"] = response_data["disaggregated_params"] async for res in self.generate_locally(request): diff --git a/examples/tensorrt_llm/utils/trtllm_utils.py b/examples/tensorrt_llm/utils/trtllm_utils.py index 773e2be50c..a7aafb242b 100644 --- a/examples/tensorrt_llm/utils/trtllm_utils.py +++ b/examples/tensorrt_llm/utils/trtllm_utils.py @@ -20,18 +20,21 @@ class Config: """Command line parameters or defaults""" - namespace: str - component: str - endpoint: str - model_path: str - served_model_name: Optional[str] = None - tensor_parallel_size: int - kv_block_size: int - extra_engine_args: str - publish_events_and_metrics: bool - disaggregation_mode: DisaggregationMode - disaggregation_strategy: DisaggregationStrategy - next_endpoint: str + def __init__(self) -> None: + self.namespace: str = "" + self.component: str = "" + self.endpoint: str = "" + self.model_path: str = "" + self.served_model_name: Optional[str] = None + self.tensor_parallel_size: int = 1 + self.kv_block_size: int = 32 + self.extra_engine_args: str = "" + self.publish_events_and_metrics: bool = False + self.disaggregation_mode: DisaggregationMode = DEFAULT_DISAGGREGATION_MODE + self.disaggregation_strategy: DisaggregationStrategy = ( + DEFAULT_DISAGGREGATION_STRATEGY + ) + self.next_endpoint: str = "" def __str__(self) -> str: return (