
Commit 0b7cdf5

tanmayv25 and rmccorm4 authored

feat: Using NIXL for KV cache transfer when using disaggregated serving in TRTLLM (#1591)

Signed-off-by: Tanmay Verma <tanmay2592@gmail.com>
Co-authored-by: Ryan McCormick <rmccormick@nvidia.com>

1 parent 0c9ae4d commit 0b7cdf5

File tree

5 files changed: +174 −24 lines changed

- container/Dockerfile.tensorrt_llm
- container/build.sh
- container/build_trtllm_wheel.sh
- container/deps/tensorrt_llm/install_nixl.sh
- examples/tensorrt_llm/README.md

container/Dockerfile.tensorrt_llm

Lines changed: 15 additions & 2 deletions
```diff
@@ -324,16 +324,29 @@ RUN pip install dist/ai_dynamo_runtime*cp312*.whl && \
 
 ENV DYNAMO_HOME=/workspace
 
+ARG ARCH_ALT
+ENV LD_LIBRARY_PATH=/usr/local/nixl/lib/${ARCH_ALT}-linux-gnu:$LD_LIBRARY_PATH
+
 # Use UCX for TRTLLM KV Cache Transfer
-ENV TRTLLM_USE_UCX_KVCACHE=1
+ARG TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL
 
+# Create a script that sets the environment variables and source it
+RUN echo '#!/bin/bash' > /usr/local/bin/set_trtllm_env.sh && \
+    if [ "$TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL" = "1" ]; then \
+        echo 'export TRTLLM_USE_NIXL_KVCACHE=1' >> /usr/local/bin/set_trtllm_env.sh; \
+    else \
+        echo 'export TRTLLM_USE_UCX_KVCACHE=1' >> /usr/local/bin/set_trtllm_env.sh; \
+    fi && \
+    chmod +x /usr/local/bin/set_trtllm_env.sh
+
+# Source the script in bashrc
+RUN echo 'source /usr/local/bin/set_trtllm_env.sh' >> /root/.bashrc
 
 # Copy launch banner
 RUN --mount=type=bind,source=./container/launch_message.txt,target=/workspace/launch_message.txt \
     sed '/^#\s/d' /workspace/launch_message.txt > ~/.launch_screen && \
     echo "cat ~/.launch_screen" >> ~/.bashrc
 
-
 # FIXME: May want a modification with dynamo banner on entry
 ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
 CMD []
```
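To make the effect of the new `RUN` block concrete, here is a sketch of the generated environment script for each setting of the build arg (contents derived directly from the `echo` lines above):

```bash
#!/bin/bash
# /usr/local/bin/set_trtllm_env.sh when the image is built with
# TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL=1:
export TRTLLM_USE_NIXL_KVCACHE=1

# In the default build (arg unset or not "1"), the script instead contains:
# export TRTLLM_USE_UCX_KVCACHE=1
```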

container/build.sh

Lines changed: 14 additions & 1 deletion
```diff
@@ -90,6 +90,7 @@ TENSORRTLLM_PIP_WHEEL_DIR="/tmp/trtllm_wheel/"
 # variables to learn how to run a pipeline with a specific commit.
 DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="137fe35539ea182f1495f5021bfda97c729e50c3"
 TRTLLM_COMMIT=""
+TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="0"
 
 # TensorRT-LLM PyPI index URL
 TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple"
@@ -166,6 +167,13 @@ get_options() {
             fi
             USE_DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT=true
             ;;
+        --trtllm-use-nixl-kvcache-experimental)
+            if [ -n "$2" ] && [[ "$2" != --* ]]; then
+                echo "ERROR: --trtllm-use-nixl-kvcache-experimental does not take any argument"
+                exit 1
+            fi
+            TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="1"
+            ;;
         --tensorrtllm-pip-wheel)
             if [ "$2" ]; then
                 TENSORRTLLM_PIP_WHEEL=$2
@@ -364,6 +372,7 @@ show_help() {
     echo " [--build-context name=path to add build context]"
     echo " [--release-build perform a release build]"
     echo " [--make-efa Enables EFA support for NIXL]"
+    echo " [--trtllm-use-nixl-kvcache-experimental Enables NIXL KVCACHE experimental support for TensorRT-LLM]"
     exit 0
 }
 
@@ -492,6 +501,10 @@ if [[ $FRAMEWORK == "TENSORRTLLM" ]]; then
         TRTLLM_COMMIT="$DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT"
     fi
 
+    if [ -n "${TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL}" ]; then
+        BUILD_ARGS+=" --build-arg TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL=${TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL} "
+    fi
+
     # If user didn't set both wheel and commit, use default tensorrt_llm pip wheel
     if [ -z "$TENSORRTLLM_PIP_WHEEL" ] && [ -z "$TRTLLM_COMMIT" ]; then
         TENSORRTLLM_PIP_WHEEL="$DEFAULT_TENSORRTLLM_PIP_WHEEL"
@@ -507,7 +520,7 @@ if [[ $FRAMEWORK == "TENSORRTLLM" ]]; then
         echo "Checking for TensorRT-LLM wheel in ${TENSORRTLLM_PIP_WHEEL_DIR}"
         if ! check_wheel_file "${TENSORRTLLM_PIP_WHEEL_DIR}" "${ARCH}_${TRTLLM_COMMIT}"; then
             echo "WARN: Valid trtllm wheel file not found in ${TENSORRTLLM_PIP_WHEEL_DIR}, attempting to build from source"
-            if ! env -i ${SOURCE_DIR}/build_trtllm_wheel.sh -o ${TENSORRTLLM_PIP_WHEEL_DIR} -c ${TRTLLM_COMMIT} -a ${ARCH}; then
+            if ! env -i ${SOURCE_DIR}/build_trtllm_wheel.sh -o ${TENSORRTLLM_PIP_WHEEL_DIR} -c ${TRTLLM_COMMIT} -a ${ARCH} -n ${NIXL_COMMIT}; then
                 error "ERROR: Failed to build TensorRT-LLM wheel"
             fi
         fi
```
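The new option is a bare switch; passing a value after it is rejected by the parser above. A minimal usage sketch, run from the repository root:

```bash
# Build the TensorRT-LLM image with the experimental NIXL KV cache path.
# build.sh forwards the setting to docker as:
#   --build-arg TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL=1
./container/build.sh --framework tensorrtllm \
    --use-default-experimental-tensorrtllm-commit \
    --trtllm-use-nixl-kvcache-experimental
```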

container/build_trtllm_wheel.sh

Lines changed: 15 additions & 3 deletions
```diff
@@ -18,15 +18,17 @@
 
 # This script builds the TRT-LLM base image for Dynamo with TensorRT-LLM.
 
-while getopts "c:o:a:" opt; do
+while getopts "c:o:a:n:" opt; do
   case ${opt} in
     c) TRTLLM_COMMIT=$OPTARG ;;
     o) OUTPUT_DIR=$OPTARG ;;
     a) ARCH=$OPTARG ;;
-    *) echo "Usage: $(basename $0) [-c commit] [-o output_dir] [-a arch]"
+    n) NIXL_COMMIT=$OPTARG ;;
+    *) echo "Usage: $(basename $0) [-c commit] [-o output_dir] [-a arch] [-n nixl_commit]"
        echo " -c: TensorRT-LLM commit to build"
        echo " -o: Output directory for wheel files"
        echo " -a: Architecture (amd64 or arm64)"
+       echo " -n: NIXL commit"
        exit 1 ;;
   esac
 done
@@ -36,6 +38,8 @@ if [ -z "$OUTPUT_DIR" ]; then
   OUTPUT_DIR="/tmp/trtllm_wheel"
 fi
 
+# Store directory where script is being launched from
+MAIN_DIR=$(dirname "$(readlink -f "$0")")
 
 (cd /tmp && \
 # Clone the TensorRT-LLM repository.
@@ -79,8 +83,16 @@ sed -i "s/__version__ = \"\(.*\)\"/__version__ = \"\1+dev${COMMIT_VERSION}\"/" "
 echo "Updated version:"
 grep "__version__" "$VERSION_FILE"
 
+echo "Copying install_nixl.sh from $MAIN_DIR to ${PWD}/docker/common/"
+# Copy install_nixl.sh to docker/common/
+cp $MAIN_DIR/deps/tensorrt_llm/install_nixl.sh docker/common/install_nixl.sh
+# Update NIXL_COMMIT in install_nixl.sh to use the parameter passed to this script
+sed -i "s/NIXL_COMMIT=\"[^\"]*\"/NIXL_COMMIT=\"${NIXL_COMMIT}\"/" docker/common/install_nixl.sh
 
-make -C docker wheel_build
+
+# Need to build in the Triton Devel Image for NIXL support.
+make -C docker tritondevel_build
+make -C docker wheel_build DEVEL_IMAGE=tritondevel BUILD_WHEEL_OPTS='--extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl'
 
 # Copy the wheel to the host
 mkdir -p $OUTPUT_DIR
```
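For reference, a hypothetical standalone invocation of the wheel build script with the new `-n` flag (build.sh normally invokes it for you; the hashes below are the default commits quoted elsewhere in this change, shown only as placeholders):

```bash
# Build the TensorRT-LLM wheel with NIXL support baked in.
./container/build_trtllm_wheel.sh \
    -o /tmp/trtllm_wheel \
    -c 137fe35539ea182f1495f5021bfda97c729e50c3 \
    -a amd64 \
    -n 16348080f5bdeb9fe6058a23be140cec020ef3f3
```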
container/deps/tensorrt_llm/install_nixl.sh

Lines changed: 80 additions & 0 deletions
```diff
@@ -0,0 +1,80 @@
+#!/bin/bash -e
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Install NIXL for TensorRT-LLM.
+# This script is an adapted version of the NIXL install script from the TensorRT-LLM repository.
+# The original script is located at:
+# https://github.com/NVIDIA/TensorRT-LLM/blob/main/docker/common/install_nixl.sh
+
+set -ex
+
+GITHUB_URL="https://github.com"
+
+UCX_VERSION="v1.18.1"
+UCX_INSTALL_PATH="/usr/local/ucx/"
+CUDA_PATH="/usr/local/cuda"
+
+NIXL_COMMIT="16348080f5bdeb9fe6058a23be140cec020ef3f3"
+
+UCX_REPO="https://github.com/openucx/ucx.git"
+NIXL_REPO="https://github.com/ai-dynamo/nixl.git"
+
+
+
+
+if [ ! -d ${UCX_INSTALL_PATH} ]; then
+  git clone --depth 1 -b ${UCX_VERSION} ${UCX_REPO}
+  cd ucx
+  ./autogen.sh
+  ./contrib/configure-release \
+    --prefix=${UCX_INSTALL_PATH} \
+    --enable-shared \
+    --disable-static \
+    --disable-doxygen-doc \
+    --enable-optimizations \
+    --enable-cma \
+    --enable-devel-headers \
+    --with-cuda=${CUDA_PATH} \
+    --with-verbs \
+    --with-dm \
+    --enable-mt
+  make install -j$(nproc)
+  cd ..
+  rm -rf ucx  # Remove UCX source to save space
+  echo "export LD_LIBRARY_PATH=${UCX_INSTALL_PATH}/lib:\$LD_LIBRARY_PATH" >> "${ENV}"
+fi
+
+ARCH_NAME="x86_64-linux-gnu"
+if [ "$(uname -m)" != "amd64" ] && [ "$(uname -m)" != "x86_64" ]; then
+  ARCH_NAME="aarch64-linux-gnu"
+  EXTRA_NIXL_ARGS="-Ddisable_gds_backend=true"
+fi
+
+if [ $ARCH_NAME != "x86_64-linux-gnu" ]; then
+  echo "The NIXL backend is temporarily unavailable on the aarch64 platform. Exiting script."
+  exit 0
+fi
+
+pip3 install --no-cache-dir meson ninja pybind11
+git clone ${NIXL_REPO} nixl
+cd nixl
+git checkout ${NIXL_COMMIT}
+meson setup builddir -Ducx_path=${UCX_INSTALL_PATH} -Dstatic_plugins=UCX -Dbuildtype=release ${EXTRA_NIXL_ARGS}
+cd builddir && ninja install
+cd ../..
+rm -rf nixl*  # Remove NIXL source tree to save space
+
+echo "export LD_LIBRARY_PATH=/opt/nvidia/nvda_nixl/lib/${ARCH_NAME}:/opt/nvidia/nvda_nixl/lib64:\$LD_LIBRARY_PATH" >> "${ENV}"
```
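A quick sanity check after the script runs, using the install locations defined above (x86_64 case shown; an illustrative sketch, not part of the commit):

```bash
# UCX libraries should land under the UCX install prefix...
ls /usr/local/ucx/lib
# ...and NIXL under /opt/nvidia/nvda_nixl, matching the exported LD_LIBRARY_PATH.
ls /opt/nvidia/nvda_nixl/lib/x86_64-linux-gnu /opt/nvidia/nvda_nixl/lib64
```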

examples/tensorrt_llm/README.md

Lines changed: 50 additions & 18 deletions
````diff
@@ -69,15 +69,6 @@ apt-get update && apt-get -y install git git-lfs
 ./container/build.sh --framework tensorrtllm --use-default-experimental-tensorrtllm-commit
 ```
 
-> [!NOTE]
-> Because of a known issue of C++11 ABI compatibility within the NGC pytorch container,
-> we rebuild TensorRT-LLM from source. See [here](https://nvidia.github.io/TensorRT-LLM/installation/linux.html)
-> for more information.
->
-> Hence, when running this script for the first time, the time taken by this script can be
-> quite long.
-
-
 ### Run container
 
 ```
@@ -306,13 +297,54 @@ See [close deployment](../../docs/guides/dynamo_serve.md#close-deployment) section
 To benchmark your deployment with GenAI-Perf, see this utility script, configuring the
 `model` name and `host` based on your deployment: [perf.sh](../../benchmarks/llm/perf.sh)
 
-### Future Work
 
-Remaining tasks:
-- [x] Add support for the disaggregated serving.
-- [x] Add multi-node support.
-- [x] Add instructions for benchmarking.
-- [x] Use processor from dynamo-llm framework.
-- [ ] Add integration test coverage.
-- [ ] Merge the code base with llm example to reduce the code duplication.
-- [ ] Enable NIXL integration with TensorRT-LLM once available. Currently, TensorRT-LLM uses UCX to transfer KV cache.
+### KV Cache Transfer for Disaggregated Serving
+
+In disaggregated serving architectures, the KV cache must be transferred between prefill and decode nodes. TensorRT-LLM supports two methods for this transfer:
+
+#### Default Method: UCX
+By default, TensorRT-LLM uses UCX (Unified Communication X) for KV cache transfer between prefill and decode nodes. UCX provides high-performance communication optimized for GPU-to-GPU transfers.
+
+#### Experimental Method: NIXL
+TensorRT-LLM also provides experimental support for using **NIXL** (NVIDIA Inference Xfer Library) for KV cache transfer. [NIXL](https://github.com/ai-dynamo/nixl) is NVIDIA's high-performance communication library designed for efficient data transfer in distributed GPU environments.
+
+**Note:** NIXL support in TensorRT-LLM is experimental and not yet suitable for production environments.
+
+#### Using NIXL for KV Cache Transfer
+
+To enable NIXL for KV cache transfer in disaggregated serving:
+
+1. **Build the container with NIXL support:**
+   The TensorRT-LLM wheel must be built from source with NIXL support. The `./container/build.sh` script caches previously built TensorRT-LLM wheels to reduce build time. If you have previously built a TensorRT-LLM wheel without NIXL support, delete the cached wheel to force a rebuild with NIXL support.
+
+   **Remove the cached TensorRT-LLM wheel (only if previously built without NIXL support):**
+   ```bash
+   rm -rf /tmp/trtllm_wheel
+   ```
+
+   **Build the container with NIXL support:**
+   ```bash
+   ./container/build.sh --framework tensorrtllm \
+     --use-default-experimental-tensorrtllm-commit \
+     --trtllm-use-nixl-kvcache-experimental
+   ```
+
+   **Note:** Both the `--use-default-experimental-tensorrtllm-commit` and `--trtllm-use-nixl-kvcache-experimental` flags are required to enable NIXL support.
+
+2. **Run the containerized environment:**
+   See the [run container](#run-container) section to learn how to start the container image built in the previous step.
+
+3. **Start the disaggregated service:**
+   See [disaggregated serving](#disaggregated-serving) to learn how to start the deployment.
+
+4. **Send the request:**
+   See the [client](#client) section to learn how to send a request to the deployment.
+
+**Important:** Ensure that ETCD and NATS services are running before starting the service.
+
+The container automatically sets the appropriate environment variable (`TRTLLM_USE_NIXL_KVCACHE=1`) when built with the NIXL flag. The same container image can still use UCX for KV cache transfer:
+```bash
+unset TRTLLM_USE_NIXL_KVCACHE
+export TRTLLM_USE_UCX_KVCACHE=1
+```
+
````
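To confirm which transport an already-running container picked up, check the variables exported by the generated environment script (an illustrative check, not part of the commit):

```bash
# Exactly one of these should print with value 1.
env | grep -E 'TRTLLM_USE_(NIXL|UCX)_KVCACHE'
```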
