23 changes: 10 additions & 13 deletions container/Dockerfile.vllm
@@ -12,23 +12,22 @@ ARG RELEASE_BUILD
ARG ENABLE_KVBM=false
ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04"
ARG CUDA_VERSION="12.8"

# Make sure to update the dependency version in pyproject.toml when updating this
ARG VLLM_REF="1da94e673c257373280026f75ceb4effac80e892" # from v0.10.1.1
ARG VLLM_REF="v0.10.2"
# FlashInfer is only respected when building vLLM from source, i.e. when VLLM_REF does not start with 'v', or for arm64 builds
ARG FLASHINF_REF="v0.3.0"
ARG TORCH_BACKEND="cu128"

# If left blank, then we will fall back to vLLM defaults
ARG DEEPGEMM_REF=""

# sccache configuration - inherit from base build
ARG USE_SCCACHE
ARG SCCACHE_BUCKET=""
ARG SCCACHE_REGION=""

# Match 0.10.1.1 vLLM release
# https://github.com/vllm-project/vllm/releases/tag/v0.10.1.1
# Pinned to commit before https://github.com/deepseek-ai/DeepGEMM/pull/112 for DeepGEMM which seems to break on H100:
# "RuntimeError: Failed: CUDA runtime error csrc/jit/kernel_runtime.hpp:108 '98'"
ARG DEEPGEMM_REF="f85ec64"
ARG FLASHINF_REF="v0.2.11"

# Define general architecture ARGs for supporting both x86 and aarch64 builds.
# ARCH: Used for package suffixes (e.g., amd64, arm64)
# ARCH_ALT: Used for Rust targets, manylinux suffix (e.g., x86_64, aarch64)
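For orientation, these ARGs can be overridden at image build time with `docker build --build-arg`. A minimal sketch — the image tag and build context are illustrative, not taken from this repo's build scripts, and only the `--build-arg` names come from the Dockerfile above:

```bash
# Hypothetical build invocation; adjust the tag, context, and arg values as needed.
docker build \
  -f container/Dockerfile.vllm \
  --build-arg VLLM_REF="v0.10.2" \
  --build-arg FLASHINF_REF="v0.3.0" \
  --build-arg DEEPGEMM_REF="" \
  --build-arg TORCH_BACKEND="cu128" \
  --build-arg CUDA_VERSION="12.8" \
  -t dynamo-vllm:dev \
  .
```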
@@ -108,6 +107,7 @@ ARG VLLM_GIT_URL
ARG DEEPGEMM_REF
ARG FLASHINF_REF
ARG TORCH_BACKEND
ARG CUDA_VERSION

ARG MAX_JOBS=16
ENV MAX_JOBS=$MAX_JOBS
@@ -138,18 +138,15 @@ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
# TODO - split vllm, DeepEP, DeepGeMM, PPLX installs
# Should be able to select how you want your build to go
cp /tmp/deps/vllm/install_vllm.sh /tmp/install_vllm.sh && \
chmod +x /tmp/install_vllm.sh && \
/tmp/install_vllm.sh --editable --vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt --deepgemm-ref $DEEPGEMM_REF --flashinf-ref $FLASHINF_REF --torch-backend $TORCH_BACKEND && \
/tmp/install_vllm.sh --editable --vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt ${DEEPGEMM_REF:+--deepgemm-ref "$DEEPGEMM_REF"} ${FLASHINF_REF:+--flashinf-ref "$FLASHINF_REF"} --torch-backend $TORCH_BACKEND --cuda-version $CUDA_VERSION && \
/tmp/use-sccache.sh show-stats "vLLM";
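The `${DEEPGEMM_REF:+--deepgemm-ref "$DEEPGEMM_REF"}` and `${FLASHINF_REF:+--flashinf-ref "$FLASHINF_REF"}` forms in the RUN line above are standard bash parameter expansion: the flag pair is emitted only when the variable is set and non-empty. A minimal standalone sketch of the pattern (variable and flag names here are illustrative):

```bash
#!/usr/bin/env bash
# ${VAR:+WORD} expands to WORD only if VAR is set and non-empty; otherwise it expands to nothing.
REF=""
echo ./install.sh ${REF:+--ref "$REF"}   # prints: ./install.sh
REF="f85ec64"
echo ./install.sh ${REF:+--ref "$REF"}   # prints: ./install.sh --ref f85ec64
```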

ENV LD_LIBRARY_PATH=\
/opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\
$LD_LIBRARY_PATH


##################################################
########## Runtime Image ########################
##################################################
@@ -362,4 +359,4 @@ RUN uv pip install maturin[patchelf]
ENV PYTHONPATH=${WORKSPACE_DIR}/components/metrics/src:${WORKSPACE_DIR}/components/frontend/src:${WORKSPACE_DIR}/components/planner/src:${WORKSPACE_DIR}/components/backends/mocker/src:${WORKSPACE_DIR}/components/backends/trtllm/src:${WORKSPACE_DIR}/components/backends/vllm/src:${WORKSPACE_DIR}/components/backends/sglang/src:${WORKSPACE_DIR}/components/backends/llama_cpp/src

ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD []
CMD []
253 changes: 146 additions & 107 deletions container/deps/vllm/install_vllm.sh
@@ -1,44 +1,35 @@
#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Install vLLM and wideEP kernels from a specific git reference

# This script installs vLLM and its dependencies.
# If installing vLLM from a release tag, we use pip to manage the install.
# Otherwise, we check out the vLLM source code with git and build it from source.
# The dependencies are installed in the following order:
# 1. vLLM
# 2. LMCache
# 3. DeepGEMM
# 4. EP kernels

set -euo pipefail

# Parse arguments
EDITABLE=true
# REMOVE nvshmem cherry-pick when moving to next version of vllm
VLLM_REF="1da94e673c257373280026f75ceb4effac80e892" # from v0.10.1.1
# When updating above VLLM_REF make sure precompiled wheel file URL is correct. Run this command:
# aws s3 ls s3://vllm-wheels/${VLLM_REF}/ --region us-west-2 --no-sign-request
VLLM_PRECOMPILED_WHEEL_LOCATION="https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_REF}/vllm-0.10.1.1-cp38-abi3-manylinux1_x86_64.whl"
VLLM_GIT_URL="https://github.com/vllm-project/vllm.git"
VLLM_REF="v0.10.2"

# Basic Configurations
ARCH=$(uname -m)
MAX_JOBS=16
INSTALLATION_DIR=/tmp
ARCH=$(uname -m)
DEEPGEMM_REF="f85ec64"
FLASHINF_REF="v0.2.11"

# VLLM and Dependency Configurations
TORCH_BACKEND="cu128"
TORCH_CUDA_ARCH_LIST="9.0;10.0" # For EP Kernels
DEEPGEMM_REF=""
CUDA_VERSION="12.8" # For DEEPGEMM

# Convert x86_64 to amd64 for consistency with Docker ARG
if [ "$ARCH" = "x86_64" ]; then
ARCH="amd64"
elif [ "$ARCH" = "aarch64" ]; then
ARCH="arm64"
fi
# These flags are applicable when installing vLLM from source code
EDITABLE=true
VLLM_GIT_URL="https://github.com/vllm-project/vllm.git"
FLASHINF_REF="v0.3.0"

while [[ $# -gt 0 ]]; do
case $1 in
@@ -82,8 +73,16 @@ while [[ $# -gt 0 ]]; do
TORCH_BACKEND="$2"
shift 2
;;
--torch-cuda-arch-list)
TORCH_CUDA_ARCH_LIST="$2"
shift 2
;;
--cuda-version)
CUDA_VERSION="$2"
shift 2
;;
-h|--help)
echo "Usage: $0 [--editable|--no-editable] [--vllm-ref REF] [--max-jobs NUM] [--arch ARCH] [--deepgemm-ref REF] [--flashinf-ref REF] [--torch-backend BACKEND]"
echo "Usage: $0 [--editable|--no-editable] [--vllm-ref REF] [--max-jobs NUM] [--arch ARCH] [--deepgemm-ref REF] [--flashinf-ref REF] [--torch-backend BACKEND] [--torch-cuda-arch-list LIST] [--cuda-version VERSION]"
echo "Options:"
echo " --editable Install vllm in editable mode (default)"
echo " --no-editable Install vllm in non-editable mode"
@@ -94,6 +93,8 @@ while [[ $# -gt 0 ]]; do
echo " --deepgemm-ref REF Git reference for DeepGEMM (default: ${DEEPGEMM_REF})"
echo " --flashinf-ref REF Git reference for Flash Infer (default: ${FLASHINF_REF})"
echo " --torch-backend BACKEND Torch backend to use (default: ${TORCH_BACKEND})"
echo " --torch-cuda-arch-list LIST CUDA architectures to compile for (default: ${TORCH_CUDA_ARCH_LIST})"
echo " --cuda-version VERSION CUDA version to use (default: ${CUDA_VERSION})"
exit 0
;;
*)
@@ -103,105 +104,143 @@ while [[ $# -gt 0 ]]; do
esac
done
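As a usage sketch of the extended interface, the new `--torch-cuda-arch-list` and `--cuda-version` options slot in alongside the existing flags. The values below simply mirror the defaults defined in this script and are illustrative, not a recommendation:

```bash
# Hypothetical standalone invocation; the script path is the one shown in this PR.
./container/deps/vllm/install_vllm.sh \
  --editable \
  --vllm-ref v0.10.2 \
  --max-jobs 16 \
  --arch amd64 \
  --flashinf-ref v0.3.0 \
  --torch-backend cu128 \
  --torch-cuda-arch-list "9.0;10.0" \
  --cuda-version 12.8
```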

# Convert x86_64 to amd64 for consistency with Docker ARG
if [ "$ARCH" = "x86_64" ]; then
ARCH="amd64"
elif [ "$ARCH" = "aarch64" ]; then
ARCH="arm64"
fi

export MAX_JOBS=$MAX_JOBS
export CUDA_HOME=/usr/local/cuda

echo "Installing vllm with the following configuration:"
echo " EDITABLE: $EDITABLE"
echo " VLLM_REF: $VLLM_REF"
echo " MAX_JOBS: $MAX_JOBS"
echo " ARCH: $ARCH"
echo " TORCH_BACKEND: $TORCH_BACKEND"

# Install common dependencies
echo "=== Installing prerequisites ==="
uv pip install pip cuda-python

if [ "$ARCH" = "amd64" ]; then
# LMCache installation currently fails on arm64 due to CUDA dependency issues:
# OSError: CUDA_HOME environment variable is not set. Please set it to your CUDA install root.
# TODO: Re-enable for arm64 after verifying lmcache compatibility and resolving the build issue.
uv pip install lmcache==0.3.3
fi
echo "\n=== Configuration Summary ==="
echo " VLLM_REF=$VLLM_REF | EDITABLE=$EDITABLE | ARCH=$ARCH"
echo " MAX_JOBS=$MAX_JOBS | TORCH_BACKEND=$TORCH_BACKEND | CUDA_VERSION=$CUDA_VERSION"
echo " TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST"
echo " DEEPGEMM_REF=$DEEPGEMM_REF | FLASHINF_REF=$FLASHINF_REF"
echo " INSTALLATION_DIR=$INSTALLATION_DIR | VLLM_GIT_URL=$VLLM_GIT_URL"

# Create vllm directory and clone
mkdir -p $INSTALLATION_DIR
echo "\n=== Cloning vLLM repository ==="
# We need to clone to install dependencies
cd $INSTALLATION_DIR
git clone $VLLM_GIT_URL vllm
cd vllm
git checkout $VLLM_REF
# nvshmem fix - cherry-pick commit pinning pplx version
# https://github.com/ai-dynamo/dynamo/actions/runs/17907241473/job/50910654042?pr=2969#step:8:280
# remove when moving to next version of vllm
# Configure git user for cherry-pick operation
GIT_COMMITTER_NAME="Container Build" GIT_COMMITTER_EMAIL="container@buildkitsandbox.local" git cherry-pick 906e461ed6ddccd3cc7b68fa72048d2d3fcbd72c

if [ "$ARCH" = "arm64" ]; then
echo "Installing vllm for ARM64 architecture"

# Try to install specific PyTorch version first, fallback to latest nightly
echo "Attempting to install pinned PyTorch nightly versions..."
if ! uv pip install torch==2.7.1+cu128 torchaudio==2.7.1 torchvision==0.22.1 --index-url https://download.pytorch.org/whl; then
echo "Pinned versions failed"
exit 1
# uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
fi

python use_existing_torch.py
uv pip install -r requirements/build.txt
# TODO: remove in a future vLLM release and reinstate the ignore-torch script
# https://github.com/vllm-project/vllm/pull/24729
GIT_COMMITTER_NAME="Container Build" GIT_COMMITTER_EMAIL="container@buildkitsandbox.local" git cherry-pick 740f064


echo "\n=== Installing vLLM & FlashInfer ==="

if [[ $VLLM_REF =~ ^v ]] && [ "$ARCH" = "amd64" ]; then
# VLLM_REF starts with 'v' and amd64 - use pip install with version tag
echo "Installing vLLM $VLLM_REF from PyPI..."

uv pip install vllm[flashinfer]==$VLLM_REF --torch-backend=$TORCH_BACKEND

if [ "$EDITABLE" = "true" ]; then
MAX_JOBS=${MAX_JOBS} uv pip install --no-build-isolation -e . -v
else
MAX_JOBS=${MAX_JOBS} uv pip install --no-build-isolation . -v
fi
else
echo "Installing vllm for AMD64 architecture"
# VLLM_REF does not start with 'v', or the arch is not amd64 - build from source via git checkout
if [ "$ARCH" = "arm64" ]; then

echo "Attempting to install pinned OpenAI version..."
if ! uv pip install openai==1.99.9; then
echo "Pinned versions failed"
exit 1
fi
# torch 2.8.0 doesn't have an aarch64 wheel for cu128; vLLM compiles its aarch64 wheel against torch 2.8.0 nightly builds.
# Nightly builds can be unstable, so we do not use them here.
# For now we use torch 2.7.1+cu128, but this requires recompiling vLLM from source.

echo "Building vLLM from source for ARM64 architecture..."

export VLLM_PRECOMPILED_WHEEL_LOCATION="${VLLM_PRECOMPILED_WHEEL_LOCATION}"
# Try to install specific PyTorch version first
echo "Attempting to install pinned PyTorch nightly versions..."
if ! uv pip install torch==2.7.1+cu128 torchaudio==2.7.1 torchvision==0.22.1 --index-url https://download.pytorch.org/whl/cu128; then
echo "Pinned versions failed"
exit 1
fi

# Create constraints file to pin all PyTorch-related versions
echo "Creating constraints file to preserve PyTorch ecosystem versions..."
TORCH_VERSION=$(python -c "import torch; print(torch.__version__)")
TORCHAUDIO_VERSION=$(python -c "import torchaudio; print(torchaudio.__version__)")
TORCHVISION_VERSION=$(python -c "import torchvision; print(torchvision.__version__)")

rm -rf /tmp/torch_constraints.txt
echo "torch==$TORCH_VERSION" > /tmp/torch_constraints.txt
echo "torchaudio==$TORCHAUDIO_VERSION" >> /tmp/torch_constraints.txt
echo "torchvision==$TORCHVISION_VERSION" >> /tmp/torch_constraints.txt

echo "Pinned versions:"
echo " - torch==$TORCH_VERSION"
echo " - torchaudio==$TORCHAUDIO_VERSION"
echo " - torchvision==$TORCHVISION_VERSION"

python use_existing_torch.py
uv pip install -c /tmp/torch_constraints.txt -r requirements/build.txt

if [ "$EDITABLE" = "true" ]; then
MAX_JOBS=${MAX_JOBS} uv pip install --no-build-isolation -c /tmp/torch_constraints.txt -e . -v
else
MAX_JOBS=${MAX_JOBS} uv pip install --no-build-isolation -c /tmp/torch_constraints.txt . -v
fi

echo "\n=== Installing FlashInfer from source ==="
cd $INSTALLATION_DIR
git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
cd flashinfer
git checkout $FLASHINF_REF

# Install with constraints to prevent PyTorch upgrade
uv pip install -v --no-build-isolation -c /tmp/torch_constraints.txt .

if [ "$EDITABLE" = "true" ]; then
uv pip install -e . --torch-backend=$TORCH_BACKEND
else
uv pip install . --torch-backend=$TORCH_BACKEND
echo "Building vLLM from source for AMD64 architecture..."

# When updating above VLLM_REF make sure precompiled wheel file URL is correct. Run this command:
# aws s3 ls s3://vllm-wheels/${VLLM_REF}/ --region us-west-2 --no-sign-request
export VLLM_PRECOMPILED_WHEEL_LOCATION="https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_REF}/vllm-0.10.2-cp38-abi3-manylinux1_x86_64.whl"

if [ "$EDITABLE" = "true" ]; then
uv pip install -e . --torch-backend=$TORCH_BACKEND
else
uv pip install . --torch-backend=$TORCH_BACKEND
fi

echo "\n=== Installing FlashInfer from PyPI ==="
uv pip install flashinfer-python==$FLASHINF_REF

fi
fi
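Whichever install path was taken, a quick sanity check of the resulting environment can be run afterwards. A minimal sketch, assuming the environment uv installed into is the active Python environment:

```bash
# Post-install sanity check; module names are the standard import names for these packages.
python - <<'EOF'
import torch, vllm
print("torch:", torch.__version__)
print("vllm:", vllm.__version__)
print("cuda available:", torch.cuda.is_available())
EOF
```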

# Install ep_kernels and DeepGEMM
echo "Installing ep_kernels and DeepGEMM"
cd tools/ep_kernels
TORCH_CUDA_ARCH_LIST="9.0;10.0" bash install_python_libraries.sh # These libraries aren't pinned.
cd ep_kernels_workspace
git clone https://github.com/deepseek-ai/DeepGEMM.git
cd DeepGEMM
git checkout $DEEPGEMM_REF # Pin Version

sed -i 's|git@github.com:|https://github.com/|g' .gitmodules
git submodule sync --recursive
git submodule update --init --recursive
echo "✓ vLLM installation completed"

# command for 03d0be3
python setup.py install
echo "\n=== Installing LMCache ==="
if [ "$ARCH" = "amd64" ]; then
# LMCache installation currently fails on arm64 due to CUDA dependency issues:
# OSError: CUDA_HOME environment variable is not set. Please set it to your CUDA install root.
# TODO: Re-enable for arm64 after verifying lmcache compatibility and resolving the build issue.

# new install command for post 03d0be3
# cat install.sh
# ./install.sh
# Alec: lmcache was likely compiled with a different version of torch and needs to be installed from source for arm64
uv pip install lmcache==0.3.3
echo "✓ LMCache installed"
else
echo "⚠ Skipping LMCache on ARM64 (compatibility issues)"
fi

echo "\n=== Installing DeepGEMM ==="
cd $INSTALLATION_DIR/vllm/tools

# Install Flash Infer
if [ "$ARCH" = "arm64" ]; then
uv pip install flashinfer-python
if [ -n "$DEEPGEMM_REF" ]; then
bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}" --ref "$DEEPGEMM_REF"
else
cd $INSTALLATION_DIR
git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
cd flashinfer
git checkout $FLASHINF_REF
uv pip install -v --no-build-isolation .
bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}"
fi
echo "✓ DeepGEMM installation completed"

echo "\n=== Installing EP Kernels (PPLX and DeepEP) ==="
cd ep_kernels/
TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" bash install_python_libraries.sh
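The EP kernels are the only consumer of TORCH_CUDA_ARCH_LIST here. To match the list to the local GPU rather than the "9.0;10.0" default, the compute capability can be read via PyTorch; a minimal sketch, assuming a CUDA-capable GPU is visible to the environment:

```bash
# Print the local GPU's compute capability in TORCH_CUDA_ARCH_LIST form (e.g. "9.0").
python - <<'EOF'
import torch
major, minor = torch.cuda.get_device_capability()
print(f"{major}.{minor}")
EOF
```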

echo "vllm installation completed successfully"
echo "\n✅ All installations completed successfully!"
2 changes: 0 additions & 2 deletions examples/multimodal/utils/chat_processor.py
@@ -162,7 +162,6 @@ async def preprocess(self, raw_request: ChatCompletionRequest) -> PreprocessResult:
documents=request.documents,
chat_template_kwargs=request.chat_template_kwargs,
tool_parser=self.openai_serving.tool_parser,
truncate_prompt_tokens=request.truncate_prompt_tokens,
add_special_tokens=request.add_special_tokens,
)

@@ -288,7 +287,6 @@ async def preprocess(self, raw_request: CompletionRequest) -> PreprocessResult:
request,
self.tokenizer,
input_or_inputs=request.prompt,
truncate_prompt_tokens=request.truncate_prompt_tokens,
add_special_tokens=request.add_special_tokens,
)
