
Commit 0b7cdf5

tanmayv25 and rmccorm4 authored

feat: Using NIXL for KV cache transfer when using disaggregated serving in TRTLLM (#1591)

Signed-off-by: Tanmay Verma <tanmay2592@gmail.com>
Co-authored-by: Ryan McCormick <rmccormick@nvidia.com>

1 parent 0c9ae4d commit 0b7cdf5

File tree

5 files changed: +174 −24 lines changed

- container/Dockerfile.tensorrt_llm
- container/build.sh
- container/build_trtllm_wheel.sh
- container/deps/tensorrt_llm/install_nixl.sh
- examples/tensorrt_llm/README.md

container/Dockerfile.tensorrt_llm

Lines changed: 15 additions & 2 deletions
```diff
@@ -324,16 +324,29 @@ RUN pip install dist/ai_dynamo_runtime*cp312*.whl && \
 
 ENV DYNAMO_HOME=/workspace
 
+ARG ARCH_ALT
+ENV LD_LIBRARY_PATH=/usr/local/nixl/lib/${ARCH_ALT}-linux-gnu:$LD_LIBRARY_PATH
+
 # Use UCX for TRTLLM KV Cache Transfer
-ENV TRTLLM_USE_UCX_KVCACHE=1
+ARG TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL
 
+# Create a script that sets the environment variables and source it
+RUN echo '#!/bin/bash' > /usr/local/bin/set_trtllm_env.sh && \
+    if [ "$TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL" = "1" ]; then \
+        echo 'export TRTLLM_USE_NIXL_KVCACHE=1' >> /usr/local/bin/set_trtllm_env.sh; \
+    else \
+        echo 'export TRTLLM_USE_UCX_KVCACHE=1' >> /usr/local/bin/set_trtllm_env.sh; \
+    fi && \
+    chmod +x /usr/local/bin/set_trtllm_env.sh
+
+# Source the script in bashrc
+RUN echo 'source /usr/local/bin/set_trtllm_env.sh' >> /root/.bashrc
 
 # Copy launch banner
 RUN --mount=type=bind,source=./container/launch_message.txt,target=/workspace/launch_message.txt \
     sed '/^#\s/d' /workspace/launch_message.txt > ~/.launch_screen && \
     echo "cat ~/.launch_screen" >> ~/.bashrc
 
-
 # FIXME: May want a modification with dynamo banner on entry
 ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
 CMD []
```
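To make the effect of the new `RUN` block concrete, here is a sketch of the generated environment script for each setting of the build arg (contents derived directly from the `echo` lines above):

```bash
#!/bin/bash
# /usr/local/bin/set_trtllm_env.sh when the image is built with
# TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL=1:
export TRTLLM_USE_NIXL_KVCACHE=1

# In the default build (arg unset or not "1"), the script instead contains:
# export TRTLLM_USE_UCX_KVCACHE=1
```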

container/build.sh

Lines changed: 14 additions & 1 deletion
```diff
@@ -90,6 +90,7 @@ TENSORRTLLM_PIP_WHEEL_DIR="/tmp/trtllm_wheel/"
 # variables to learn how to run a pipeline with a specific commit.
 DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="137fe35539ea182f1495f5021bfda97c729e50c3"
 TRTLLM_COMMIT=""
+TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="0"
 
 # TensorRT-LLM PyPI index URL
 TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple"
@@ -166,6 +167,13 @@ get_options() {
             fi
             USE_DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT=true
             ;;
+        --trtllm-use-nixl-kvcache-experimental)
+            if [ -n "$2" ] && [[ "$2" != --* ]]; then
+                echo "ERROR: --trtllm-use-nixl-kvcache-experimental does not take any argument"
+                exit 1
+            fi
+            TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="1"
+            ;;
         --tensorrtllm-pip-wheel)
             if [ "$2" ]; then
                 TENSORRTLLM_PIP_WHEEL=$2
@@ -364,6 +372,7 @@ show_help() {
     echo " [--build-context name=path to add build context]"
     echo " [--release-build perform a release build]"
     echo " [--make-efa Enables EFA support for NIXL]"
+    echo " [--trtllm-use-nixl-kvcache-experimental Enables NIXL KVCACHE experimental support for TensorRT-LLM]"
     exit 0
 }
 
@@ -492,6 +501,10 @@ if [[ $FRAMEWORK == "TENSORRTLLM" ]]; then
         TRTLLM_COMMIT="$DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT"
     fi
 
+    if [ -n "${TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL}" ]; then
+        BUILD_ARGS+=" --build-arg TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL=${TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL} "
+    fi
+
     # If user didn't set both wheel and commit, use default tensorrt_llm pip wheel
     if [ -z "$TENSORRTLLM_PIP_WHEEL" ] && [ -z "$TRTLLM_COMMIT" ]; then
         TENSORRTLLM_PIP_WHEEL="$DEFAULT_TENSORRTLLM_PIP_WHEEL"
@@ -507,7 +520,7 @@ if [[ $FRAMEWORK == "TENSORRTLLM" ]]; then
         echo "Checking for TensorRT-LLM wheel in ${TENSORRTLLM_PIP_WHEEL_DIR}"
         if ! check_wheel_file "${TENSORRTLLM_PIP_WHEEL_DIR}" "${ARCH}_${TRTLLM_COMMIT}"; then
             echo "WARN: Valid trtllm wheel file not found in ${TENSORRTLLM_PIP_WHEEL_DIR}, attempting to build from source"
-            if ! env -i ${SOURCE_DIR}/build_trtllm_wheel.sh -o ${TENSORRTLLM_PIP_WHEEL_DIR} -c ${TRTLLM_COMMIT} -a ${ARCH}; then
+            if ! env -i ${SOURCE_DIR}/build_trtllm_wheel.sh -o ${TENSORRTLLM_PIP_WHEEL_DIR} -c ${TRTLLM_COMMIT} -a ${ARCH} -n ${NIXL_COMMIT}; then
                 error "ERROR: Failed to build TensorRT-LLM wheel"
             fi
         fi
```
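The new option is a bare switch; passing a value after it is rejected by the parser above. A minimal usage sketch, run from the repository root:

```bash
# Build the TensorRT-LLM image with the experimental NIXL KV cache path.
# build.sh forwards the setting to docker as:
#   --build-arg TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL=1
./container/build.sh --framework tensorrtllm \
    --use-default-experimental-tensorrtllm-commit \
    --trtllm-use-nixl-kvcache-experimental
```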

container/build_trtllm_wheel.sh

Lines changed: 15 additions & 3 deletions
```diff
@@ -18,15 +18,17 @@
 
 # This script builds the TRT-LLM base image for Dynamo with TensorRT-LLM.
 
-while getopts "c:o:a:" opt; do
+while getopts "c:o:a:n:" opt; do
   case ${opt} in
     c) TRTLLM_COMMIT=$OPTARG ;;
     o) OUTPUT_DIR=$OPTARG ;;
     a) ARCH=$OPTARG ;;
-    *) echo "Usage: $(basename $0) [-c commit] [-o output_dir] [-a arch]"
+    n) NIXL_COMMIT=$OPTARG ;;
+    *) echo "Usage: $(basename $0) [-c commit] [-o output_dir] [-a arch] [-n nixl_commit]"
        echo " -c: TensorRT-LLM commit to build"
        echo " -o: Output directory for wheel files"
        echo " -a: Architecture (amd64 or arm64)"
+       echo " -n: NIXL commit"
        exit 1 ;;
   esac
 done
@@ -36,6 +38,8 @@ if [ -z "$OUTPUT_DIR" ]; then
   OUTPUT_DIR="/tmp/trtllm_wheel"
 fi
 
+# Store directory where script is being launched from
+MAIN_DIR=$(dirname "$(readlink -f "$0")")
 
 (cd /tmp && \
 # Clone the TensorRT-LLM repository.
@@ -79,8 +83,16 @@ sed -i "s/__version__ = \"\(.*\)\"/__version__ = \"\1+dev${COMMIT_VERSION}\"/" "
 echo "Updated version:"
 grep "__version__" "$VERSION_FILE"
 
+echo "Copying install_nixl.sh from $MAIN_DIR to ${PWD}/docker/common/"
+# Copy install_nixl.sh to docker/common/
+cp $MAIN_DIR/deps/tensorrt_llm/install_nixl.sh docker/common/install_nixl.sh
+# Update NIXL_COMMIT in install_nixl.sh to use the parameter passed to this script
+sed -i "s/NIXL_COMMIT=\"[^\"]*\"/NIXL_COMMIT=\"${NIXL_COMMIT}\"/" docker/common/install_nixl.sh
 
-make -C docker wheel_build
+
+# Need to build in the Triton Devel Image for NIXL support.
+make -C docker tritondevel_build
+make -C docker wheel_build DEVEL_IMAGE=tritondevel BUILD_WHEEL_OPTS='--extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl'
 
 # Copy the wheel to the host
 mkdir -p $OUTPUT_DIR
```
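For reference, a hypothetical standalone invocation of the wheel build script with the new `-n` flag (build.sh normally invokes it for you; the hashes below are the default commits quoted elsewhere in this change, shown only as placeholders):

```bash
# Build the TensorRT-LLM wheel with NIXL support baked in.
./container/build_trtllm_wheel.sh \
    -o /tmp/trtllm_wheel \
    -c 137fe35539ea182f1495f5021bfda97c729e50c3 \
    -a amd64 \
    -n 16348080f5bdeb9fe6058a23be140cec020ef3f3
```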
container/deps/tensorrt_llm/install_nixl.sh

Lines changed: 80 additions & 0 deletions
```diff
@@ -0,0 +1,80 @@
+#!/bin/bash -e
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Install NIXL for TensorRT-LLM.
+# This script is an adapted version of the NIXL install script from the TensorRT-LLM repository.
+# The original script is located at:
+# https://github.com/NVIDIA/TensorRT-LLM/blob/main/docker/common/install_nixl.sh
+
+set -ex
+
+GITHUB_URL="https://github.com"
+
+UCX_VERSION="v1.18.1"
+UCX_INSTALL_PATH="/usr/local/ucx/"
+CUDA_PATH="/usr/local/cuda"
+
+NIXL_COMMIT="16348080f5bdeb9fe6058a23be140cec020ef3f3"
+
+UCX_REPO="https://github.com/openucx/ucx.git"
+NIXL_REPO="https://github.com/ai-dynamo/nixl.git"
+
+
+
+
+if [ ! -d ${UCX_INSTALL_PATH} ]; then
+  git clone --depth 1 -b ${UCX_VERSION} ${UCX_REPO}
+  cd ucx
+  ./autogen.sh
+  ./contrib/configure-release \
+    --prefix=${UCX_INSTALL_PATH} \
+    --enable-shared \
+    --disable-static \
+    --disable-doxygen-doc \
+    --enable-optimizations \
+    --enable-cma \
+    --enable-devel-headers \
+    --with-cuda=${CUDA_PATH} \
+    --with-verbs \
+    --with-dm \
+    --enable-mt
+  make install -j$(nproc)
+  cd ..
+  rm -rf ucx  # Remove UCX source to save space
+  echo "export LD_LIBRARY_PATH=${UCX_INSTALL_PATH}/lib:\$LD_LIBRARY_PATH" >> "${ENV}"
+fi
+
+ARCH_NAME="x86_64-linux-gnu"
+if [ "$(uname -m)" != "amd64" ] && [ "$(uname -m)" != "x86_64" ]; then
+  ARCH_NAME="aarch64-linux-gnu"
+  EXTRA_NIXL_ARGS="-Ddisable_gds_backend=true"
+fi
+
+if [ $ARCH_NAME != "x86_64-linux-gnu" ]; then
+  echo "The NIXL backend is temporarily unavailable on the aarch64 platform. Exiting script."
+  exit 0
+fi
+
+pip3 install --no-cache-dir meson ninja pybind11
+git clone ${NIXL_REPO} nixl
+cd nixl
+git checkout ${NIXL_COMMIT}
+meson setup builddir -Ducx_path=${UCX_INSTALL_PATH} -Dstatic_plugins=UCX -Dbuildtype=release ${EXTRA_NIXL_ARGS}
+cd builddir && ninja install
+cd ../..
+rm -rf nixl*  # Remove NIXL source tree to save space
+
+echo "export LD_LIBRARY_PATH=/opt/nvidia/nvda_nixl/lib/${ARCH_NAME}:/opt/nvidia/nvda_nixl/lib64:\$LD_LIBRARY_PATH" >> "${ENV}"
```
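A quick sanity check after the script runs, using the install locations defined above (x86_64 case shown; an illustrative sketch, not part of the commit):

```bash
# UCX libraries should land under the UCX install prefix...
ls /usr/local/ucx/lib
# ...and NIXL under /opt/nvidia/nvda_nixl, matching the exported LD_LIBRARY_PATH.
ls /opt/nvidia/nvda_nixl/lib/x86_64-linux-gnu /opt/nvidia/nvda_nixl/lib64
```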

examples/tensorrt_llm/README.md

Lines changed: 50 additions & 18 deletions
````diff
@@ -69,15 +69,6 @@ apt-get update && apt-get -y install git git-lfs
 ./container/build.sh --framework tensorrtllm --use-default-experimental-tensorrtllm-commit
 ```
 
-> [!NOTE]
-> Because of a known issue of C++11 ABI compatibility within the NGC pytorch container,
-> we rebuild TensorRT-LLM from source. See [here](https://nvidia.github.io/TensorRT-LLM/installation/linux.html)
-> for more information.
->
-> Hence, when running this script for the first time, the time taken by this script can be
-> quite long.
-
-
 ### Run container
 
 ```
@@ -306,13 +297,54 @@ See [close deployment](../../docs/guides/dynamo_serve.md#close-deployment) section
 To benchmark your deployment with GenAI-Perf, see this utility script, configuring the
 `model` name and `host` based on your deployment: [perf.sh](../../benchmarks/llm/perf.sh)
 
-### Future Work
 
-Remaining tasks:
-- [x] Add support for the disaggregated serving.
-- [x] Add multi-node support.
-- [x] Add instructions for benchmarking.
-- [x] Use processor from dynamo-llm framework.
-- [ ] Add integration test coverage.
-- [ ] Merge the code base with llm example to reduce the code duplication.
-- [ ] Enable NIXL integration with TensorRT-LLM once available. Currently, TensorRT-LLM uses UCX to transfer KV cache.
+### KV Cache Transfer for Disaggregated Serving
+
+In disaggregated serving architectures, the KV cache must be transferred between prefill and decode nodes. TensorRT-LLM supports two methods for this transfer:
+
+#### Default Method: UCX
+By default, TensorRT-LLM uses UCX (Unified Communication X) for KV cache transfer between prefill and decode nodes. UCX provides high-performance communication optimized for GPU-to-GPU transfers.
+
+#### Experimental Method: NIXL
+TensorRT-LLM also provides experimental support for using **NIXL** (NVIDIA Inference Xfer Library) for KV cache transfer. [NIXL](https://github.com/ai-dynamo/nixl) is NVIDIA's high-performance communication library designed for efficient data transfer in distributed GPU environments.
+
+**Note:** NIXL support in TensorRT-LLM is experimental and not yet suitable for production environments.
+
+#### Using NIXL for KV Cache Transfer
+
+To enable NIXL for KV cache transfer in disaggregated serving:
+
+1. **Build the container with NIXL support:**
+   The TensorRT-LLM wheel must be built from source with NIXL support. The `./container/build.sh` script caches previously built TensorRT-LLM wheels to reduce build time. If you have previously built a TensorRT-LLM wheel without NIXL support, delete the cached wheel to force a rebuild with NIXL support.
+
+   **Remove the cached TensorRT-LLM wheel (only if previously built without NIXL support):**
+   ```bash
+   rm -rf /tmp/trtllm_wheel
+   ```
+
+   **Build the container with NIXL support:**
+   ```bash
+   ./container/build.sh --framework tensorrtllm \
+     --use-default-experimental-tensorrtllm-commit \
+     --trtllm-use-nixl-kvcache-experimental
+   ```
+
+   **Note:** Both the `--use-default-experimental-tensorrtllm-commit` and `--trtllm-use-nixl-kvcache-experimental` flags are required to enable NIXL support.
+
+2. **Run the containerized environment:**
+   See the [run container](#run-container) section to learn how to start the container image built in the previous step.
+
+3. **Start the disaggregated service:**
+   See [disaggregated serving](#disaggregated-serving) to learn how to start the deployment.
+
+4. **Send the request:**
+   See the [client](#client) section to learn how to send a request to the deployment.
+
+**Important:** Ensure that ETCD and NATS services are running before starting the service.
+
+The container automatically sets the appropriate environment variable (`TRTLLM_USE_NIXL_KVCACHE=1`) when built with the NIXL flag. The same container image can still use UCX for KV cache transfer:
+```bash
+unset TRTLLM_USE_NIXL_KVCACHE
+export TRTLLM_USE_UCX_KVCACHE=1
+```
+
````
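To confirm which transport an already-running container picked up, check the variables exported by the generated environment script (an illustrative check, not part of the commit):

```bash
# Exactly one of these should print with value 1.
env | grep -E 'TRTLLM_USE_(NIXL|UCX)_KVCACHE'
```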
