23 changes: 10 additions & 13 deletions container/Dockerfile.vllm
@@ -12,23 +12,22 @@ ARG RELEASE_BUILD
ARG ENABLE_KVBM=false
ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04"
ARG CUDA_VERSION="12.8"

# Make sure to update the dependency version in pyproject.toml when updating this
ARG VLLM_REF="1da94e673c257373280026f75ceb4effac80e892" # from v0.10.1.1
ARG VLLM_REF="v0.10.2"
# FlashInfer is only respected when building vLLM from source, i.e. when VLLM_REF does not start with 'v', or for arm64 builds
ARG FLASHINF_REF="v0.3.0"
ARG TORCH_BACKEND="cu128"

# If left blank, then we will fall back to vLLM defaults
ARG DEEPGEMM_REF=""

# sccache configuration - inherit from base build
ARG USE_SCCACHE
ARG SCCACHE_BUCKET=""
ARG SCCACHE_REGION=""

# Match 0.10.1.1 vLLM release
# https://github.com/vllm-project/vllm/releases/tag/v0.10.1.1
# Pinned to commit before https://github.com/deepseek-ai/DeepGEMM/pull/112 for DeepGEMM which seems to break on H100:
# "RuntimeError: Failed: CUDA runtime error csrc/jit/kernel_runtime.hpp:108 '98'"
ARG DEEPGEMM_REF="f85ec64"
ARG FLASHINF_REF="v0.2.11"

# Define general architecture ARGs for supporting both x86 and aarch64 builds.
# ARCH: Used for package suffixes (e.g., amd64, arm64)
# ARCH_ALT: Used for Rust targets, manylinux suffix (e.g., x86_64, aarch64)
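For orientation, these ARGs can be overridden at image build time with `docker build --build-arg`. A minimal sketch — the image tag and build context are illustrative, not taken from this repo's build scripts, and only the `--build-arg` names come from the Dockerfile above:

```bash
# Hypothetical build invocation; adjust the tag, context, and arg values as needed.
docker build \
  -f container/Dockerfile.vllm \
  --build-arg VLLM_REF="v0.10.2" \
  --build-arg FLASHINF_REF="v0.3.0" \
  --build-arg DEEPGEMM_REF="" \
  --build-arg TORCH_BACKEND="cu128" \
  --build-arg CUDA_VERSION="12.8" \
  -t dynamo-vllm:dev \
  .
```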
@@ -108,6 +107,7 @@ ARG VLLM_GIT_URL
ARG DEEPGEMM_REF
ARG FLASHINF_REF
ARG TORCH_BACKEND
ARG CUDA_VERSION

ARG MAX_JOBS=16
ENV MAX_JOBS=$MAX_JOBS
@@ -138,18 +138,15 @@ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
# TODO - split vllm, DeepEP, DeepGeMM, PPLX installs
# Should be able to select how you want your build to go
cp /tmp/deps/vllm/install_vllm.sh /tmp/install_vllm.sh && \
chmod +x /tmp/install_vllm.sh && \
/tmp/install_vllm.sh --editable --vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt --deepgemm-ref $DEEPGEMM_REF --flashinf-ref $FLASHINF_REF --torch-backend $TORCH_BACKEND && \
/tmp/install_vllm.sh --editable --vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt ${DEEPGEMM_REF:+--deepgemm-ref "$DEEPGEMM_REF"} ${FLASHINF_REF:+--flashinf-ref "$FLASHINF_REF"} --torch-backend $TORCH_BACKEND --cuda-version $CUDA_VERSION && \
/tmp/use-sccache.sh show-stats "vLLM";
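The `${DEEPGEMM_REF:+--deepgemm-ref "$DEEPGEMM_REF"}` and `${FLASHINF_REF:+--flashinf-ref "$FLASHINF_REF"}` forms in the RUN line above are standard bash parameter expansion: the flag pair is emitted only when the variable is set and non-empty. A minimal standalone sketch of the pattern (variable and flag names here are illustrative):

```bash
#!/usr/bin/env bash
# ${VAR:+WORD} expands to WORD only if VAR is set and non-empty; otherwise it expands to nothing.
REF=""
echo ./install.sh ${REF:+--ref "$REF"}   # prints: ./install.sh
REF="f85ec64"
echo ./install.sh ${REF:+--ref "$REF"}   # prints: ./install.sh --ref f85ec64
```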

ENV LD_LIBRARY_PATH=\
/opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\
$LD_LIBRARY_PATH


##################################################
########## Runtime Image ########################
##################################################
@@ -362,4 +359,4 @@ RUN uv pip install maturin[patchelf]
ENV PYTHONPATH=${WORKSPACE_DIR}/components/metrics/src:${WORKSPACE_DIR}/components/frontend/src:${WORKSPACE_DIR}/components/planner/src:${WORKSPACE_DIR}/components/backends/mocker/src:${WORKSPACE_DIR}/components/backends/trtllm/src:${WORKSPACE_DIR}/components/backends/vllm/src:${WORKSPACE_DIR}/components/backends/sglang/src:${WORKSPACE_DIR}/components/backends/llama_cpp/src

ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD []
CMD []
253 changes: 146 additions & 107 deletions container/deps/vllm/install_vllm.sh
@@ -1,44 +1,35 @@
#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Install vLLM and wideEP kernels from a specific git reference

# This script installs vLLM and its dependencies.
# If installing vLLM from a release tag, we use pip to manage the install.
# Otherwise, we check out the vLLM source code with git and build it from source.
# The dependencies are installed in the following order:
# 1. vLLM
# 2. LMCache
# 3. DeepGEMM
# 4. EP kernels

set -euo pipefail

# Parse arguments
EDITABLE=true
# REMOVE nvshmem cherry-pick when moving to next version of vllm
VLLM_REF="1da94e673c257373280026f75ceb4effac80e892" # from v0.10.1.1
# When updating above VLLM_REF make sure precompiled wheel file URL is correct. Run this command:
# aws s3 ls s3://vllm-wheels/${VLLM_REF}/ --region us-west-2 --no-sign-request
VLLM_PRECOMPILED_WHEEL_LOCATION="https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_REF}/vllm-0.10.1.1-cp38-abi3-manylinux1_x86_64.whl"
VLLM_GIT_URL="https://github.com/vllm-project/vllm.git"
VLLM_REF="v0.10.2"

# Basic Configurations
ARCH=$(uname -m)
MAX_JOBS=16
INSTALLATION_DIR=/tmp
ARCH=$(uname -m)
DEEPGEMM_REF="f85ec64"
FLASHINF_REF="v0.2.11"

# VLLM and Dependency Configurations
TORCH_BACKEND="cu128"
TORCH_CUDA_ARCH_LIST="9.0;10.0" # For EP Kernels
DEEPGEMM_REF=""
CUDA_VERSION="12.8" # For DEEPGEMM

# Convert x86_64 to amd64 for consistency with Docker ARG
if [ "$ARCH" = "x86_64" ]; then
ARCH="amd64"
elif [ "$ARCH" = "aarch64" ]; then
ARCH="arm64"
fi
# These flags are applicable when installing vLLM from source code
EDITABLE=true
VLLM_GIT_URL="https://github.com/vllm-project/vllm.git"
FLASHINF_REF="v0.3.0"

while [[ $# -gt 0 ]]; do
case $1 in
@@ -82,8 +73,16 @@ while [[ $# -gt 0 ]]; do
TORCH_BACKEND="$2"
shift 2
;;
--torch-cuda-arch-list)
TORCH_CUDA_ARCH_LIST="$2"
shift 2
;;
--cuda-version)
CUDA_VERSION="$2"
shift 2
;;
-h|--help)
echo "Usage: $0 [--editable|--no-editable] [--vllm-ref REF] [--max-jobs NUM] [--arch ARCH] [--deepgemm-ref REF] [--flashinf-ref REF] [--torch-backend BACKEND]"
echo "Usage: $0 [--editable|--no-editable] [--vllm-ref REF] [--max-jobs NUM] [--arch ARCH] [--deepgemm-ref REF] [--flashinf-ref REF] [--torch-backend BACKEND] [--torch-cuda-arch-list LIST] [--cuda-version VERSION]"
echo "Options:"
echo " --editable Install vllm in editable mode (default)"
echo " --no-editable Install vllm in non-editable mode"
@@ -94,6 +93,8 @@ while [[ $# -gt 0 ]]; do
echo " --deepgemm-ref REF Git reference for DeepGEMM (default: ${DEEPGEMM_REF})"
echo " --flashinf-ref REF Git reference for Flash Infer (default: ${FLASHINF_REF})"
echo " --torch-backend BACKEND Torch backend to use (default: ${TORCH_BACKEND})"
echo " --torch-cuda-arch-list LIST CUDA architectures to compile for (default: ${TORCH_CUDA_ARCH_LIST})"
echo " --cuda-version VERSION CUDA version to use (default: ${CUDA_VERSION})"
exit 0
;;
*)
@@ -103,105 +104,143 @@ while [[ $# -gt 0 ]]; do
esac
done
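As a usage sketch of the extended interface, the new `--torch-cuda-arch-list` and `--cuda-version` options slot in alongside the existing flags. The values below simply mirror the defaults defined in this script and are illustrative, not a recommendation:

```bash
# Hypothetical standalone invocation; the script path is the one shown in this PR.
./container/deps/vllm/install_vllm.sh \
  --editable \
  --vllm-ref v0.10.2 \
  --max-jobs 16 \
  --arch amd64 \
  --flashinf-ref v0.3.0 \
  --torch-backend cu128 \
  --torch-cuda-arch-list "9.0;10.0" \
  --cuda-version 12.8
```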

# Convert x86_64 to amd64 for consistency with Docker ARG
if [ "$ARCH" = "x86_64" ]; then
ARCH="amd64"
elif [ "$ARCH" = "aarch64" ]; then
ARCH="arm64"
fi

export MAX_JOBS=$MAX_JOBS
export CUDA_HOME=/usr/local/cuda

echo "Installing vllm with the following configuration:"
echo " EDITABLE: $EDITABLE"
echo " VLLM_REF: $VLLM_REF"
echo " MAX_JOBS: $MAX_JOBS"
echo " ARCH: $ARCH"
echo " TORCH_BACKEND: $TORCH_BACKEND"

# Install common dependencies
echo "=== Installing prerequisites ==="
uv pip install pip cuda-python

if [ "$ARCH" = "amd64" ]; then
# LMCache installation currently fails on arm64 due to CUDA dependency issues:
# OSError: CUDA_HOME environment variable is not set. Please set it to your CUDA install root.
# TODO: Re-enable for arm64 after verifying lmcache compatibility and resolving the build issue.
uv pip install lmcache==0.3.3
fi
echo "\n=== Configuration Summary ==="
echo " VLLM_REF=$VLLM_REF | EDITABLE=$EDITABLE | ARCH=$ARCH"
echo " MAX_JOBS=$MAX_JOBS | TORCH_BACKEND=$TORCH_BACKEND | CUDA_VERSION=$CUDA_VERSION"
echo " TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST"
echo " DEEPGEMM_REF=$DEEPGEMM_REF | FLASHINF_REF=$FLASHINF_REF"
echo " INSTALLATION_DIR=$INSTALLATION_DIR | VLLM_GIT_URL=$VLLM_GIT_URL"

# Create vllm directory and clone
mkdir -p $INSTALLATION_DIR
echo "\n=== Cloning vLLM repository ==="
# We need to clone to install dependencies
cd $INSTALLATION_DIR
git clone $VLLM_GIT_URL vllm
cd vllm
git checkout $VLLM_REF
# nvshmem fix - cherry-pick commit pinning pplx version
# https://github.com/ai-dynamo/dynamo/actions/runs/17907241473/job/50910654042?pr=2969#step:8:280
# remove when moving to next version of vllm
# Configure git user for cherry-pick operation
GIT_COMMITTER_NAME="Container Build" GIT_COMMITTER_EMAIL="container@buildkitsandbox.local" git cherry-pick 906e461ed6ddccd3cc7b68fa72048d2d3fcbd72c

if [ "$ARCH" = "arm64" ]; then
echo "Installing vllm for ARM64 architecture"

# Try to install specific PyTorch version first, fallback to latest nightly
echo "Attempting to install pinned PyTorch nightly versions..."
if ! uv pip install torch==2.7.1+cu128 torchaudio==2.7.1 torchvision==0.22.1 --index-url https://download.pytorch.org/whl; then
echo "Pinned versions failed"
exit 1
# uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
fi

python use_existing_torch.py
uv pip install -r requirements/build.txt
# TODO: remove in a future vLLM release and reinstate the ignore-torch script
# https://github.com/vllm-project/vllm/pull/24729
GIT_COMMITTER_NAME="Container Build" GIT_COMMITTER_EMAIL="container@buildkitsandbox.local" git cherry-pick 740f064


echo "\n=== Installing vLLM & FlashInfer ==="

if [[ $VLLM_REF =~ ^v ]] && [ "$ARCH" = "amd64" ]; then
# VLLM_REF starts with 'v' and amd64 - use pip install with version tag
echo "Installing vLLM $VLLM_REF from PyPI..."

uv pip install vllm[flashinfer]==$VLLM_REF --torch-backend=$TORCH_BACKEND

if [ "$EDITABLE" = "true" ]; then
MAX_JOBS=${MAX_JOBS} uv pip install --no-build-isolation -e . -v
else
MAX_JOBS=${MAX_JOBS} uv pip install --no-build-isolation . -v
fi
else
echo "Installing vllm for AMD64 architecture"
# VLLM_REF does not start with 'v', or the arch is not amd64 - build from source via git checkout
if [ "$ARCH" = "arm64" ]; then

echo "Attempting to install pinned OpenAI version..."
if ! uv pip install openai==1.99.9; then
echo "Pinned versions failed"
exit 1
fi
# torch 2.8.0 doesn't have an aarch64 wheel for cu128; vLLM compiles its aarch64 wheel against torch 2.8.0 nightly builds.
# Nightly builds can be unstable, so we do not use them here.
# For now we use torch 2.7.1+cu128, but this requires recompiling vLLM from source.

echo "Building vLLM from source for ARM64 architecture..."

export VLLM_PRECOMPILED_WHEEL_LOCATION="${VLLM_PRECOMPILED_WHEEL_LOCATION}"
# Try to install specific PyTorch version first
echo "Attempting to install pinned PyTorch nightly versions..."
if ! uv pip install torch==2.7.1+cu128 torchaudio==2.7.1 torchvision==0.22.1 --index-url https://download.pytorch.org/whl/cu128; then
echo "Pinned versions failed"
exit 1
fi

# Create constraints file to pin all PyTorch-related versions
echo "Creating constraints file to preserve PyTorch ecosystem versions..."
TORCH_VERSION=$(python -c "import torch; print(torch.__version__)")
TORCHAUDIO_VERSION=$(python -c "import torchaudio; print(torchaudio.__version__)")
TORCHVISION_VERSION=$(python -c "import torchvision; print(torchvision.__version__)")

rm -rf /tmp/torch_constraints.txt
echo "torch==$TORCH_VERSION" > /tmp/torch_constraints.txt
echo "torchaudio==$TORCHAUDIO_VERSION" >> /tmp/torch_constraints.txt
echo "torchvision==$TORCHVISION_VERSION" >> /tmp/torch_constraints.txt

echo "Pinned versions:"
echo " - torch==$TORCH_VERSION"
echo " - torchaudio==$TORCHAUDIO_VERSION"
echo " - torchvision==$TORCHVISION_VERSION"

python use_existing_torch.py
uv pip install -c /tmp/torch_constraints.txt -r requirements/build.txt

if [ "$EDITABLE" = "true" ]; then
MAX_JOBS=${MAX_JOBS} uv pip install --no-build-isolation -c /tmp/torch_constraints.txt -e . -v
else
MAX_JOBS=${MAX_JOBS} uv pip install --no-build-isolation -c /tmp/torch_constraints.txt . -v
fi

echo "\n=== Installing FlashInfer from source ==="
cd $INSTALLATION_DIR
git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
cd flashinfer
git checkout $FLASHINF_REF

# Install with constraints to prevent PyTorch upgrade
uv pip install -v --no-build-isolation -c /tmp/torch_constraints.txt .

if [ "$EDITABLE" = "true" ]; then
uv pip install -e . --torch-backend=$TORCH_BACKEND
else
uv pip install . --torch-backend=$TORCH_BACKEND
echo "Building vLLM from source for AMD64 architecture..."

# When updating above VLLM_REF make sure precompiled wheel file URL is correct. Run this command:
# aws s3 ls s3://vllm-wheels/${VLLM_REF}/ --region us-west-2 --no-sign-request
export VLLM_PRECOMPILED_WHEEL_LOCATION="https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_REF}/vllm-0.10.2-cp38-abi3-manylinux1_x86_64.whl"

if [ "$EDITABLE" = "true" ]; then
uv pip install -e . --torch-backend=$TORCH_BACKEND
else
uv pip install . --torch-backend=$TORCH_BACKEND
fi

echo "\n=== Installing FlashInfer from PyPI ==="
uv pip install flashinfer-python==$FLASHINF_REF

fi
fi
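Whichever install path was taken, a quick sanity check of the resulting environment can be run afterwards. A minimal sketch, assuming the environment uv installed into is the active Python environment:

```bash
# Post-install sanity check; module names are the standard import names for these packages.
python - <<'EOF'
import torch, vllm
print("torch:", torch.__version__)
print("vllm:", vllm.__version__)
print("cuda available:", torch.cuda.is_available())
EOF
```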

# Install ep_kernels and DeepGEMM
echo "Installing ep_kernels and DeepGEMM"
cd tools/ep_kernels
TORCH_CUDA_ARCH_LIST="9.0;10.0" bash install_python_libraries.sh # These libraries aren't pinned.
cd ep_kernels_workspace
git clone https://github.com/deepseek-ai/DeepGEMM.git
cd DeepGEMM
git checkout $DEEPGEMM_REF # Pin Version

sed -i 's|git@github.com:|https://github.com/|g' .gitmodules
git submodule sync --recursive
git submodule update --init --recursive
echo "✓ vLLM installation completed"

# command for 03d0be3
python setup.py install
echo "\n=== Installing LMCache ==="
if [ "$ARCH" = "amd64" ]; then
# LMCache installation currently fails on arm64 due to CUDA dependency issues:
# OSError: CUDA_HOME environment variable is not set. Please set it to your CUDA install root.
# TODO: Re-enable for arm64 after verifying lmcache compatibility and resolving the build issue.

# new install command for post 03d0be3
# cat install.sh
# ./install.sh
# Alec: lmcache was likely compiled with a different version of torch and needs to be installed from source for arm64
uv pip install lmcache==0.3.3
echo "✓ LMCache installed"
else
echo "⚠ Skipping LMCache on ARM64 (compatibility issues)"
fi

echo "\n=== Installing DeepGEMM ==="
cd $INSTALLATION_DIR/vllm/tools

# Install Flash Infer
if [ "$ARCH" = "arm64" ]; then
uv pip install flashinfer-python
if [ -n "$DEEPGEMM_REF" ]; then
bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}" --ref "$DEEPGEMM_REF"
else
cd $INSTALLATION_DIR
git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
cd flashinfer
git checkout $FLASHINF_REF
uv pip install -v --no-build-isolation .
bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}"
fi
echo "✓ DeepGEMM installation completed"

echo "\n=== Installing EP Kernels (PPLX and DeepEP) ==="
cd ep_kernels/
TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" bash install_python_libraries.sh
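The EP kernels are the only consumer of TORCH_CUDA_ARCH_LIST here. To match the list to the local GPU rather than the "9.0;10.0" default, the compute capability can be read via PyTorch; a minimal sketch, assuming a CUDA-capable GPU is visible to the environment:

```bash
# Print the local GPU's compute capability in TORCH_CUDA_ARCH_LIST form (e.g. "9.0").
python - <<'EOF'
import torch
major, minor = torch.cuda.get_device_capability()
print(f"{major}.{minor}")
EOF
```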

echo "vllm installation completed successfully"
echo "\n✅ All installations completed successfully!"
2 changes: 0 additions & 2 deletions examples/multimodal/utils/chat_processor.py
@@ -162,7 +162,6 @@ async def preprocess(self, raw_request: ChatCompletionRequest) -> PreprocessResult:
documents=request.documents,
chat_template_kwargs=request.chat_template_kwargs,
tool_parser=self.openai_serving.tool_parser,
truncate_prompt_tokens=request.truncate_prompt_tokens,
add_special_tokens=request.add_special_tokens,
)

@@ -288,7 +287,6 @@ async def preprocess(self, raw_request: CompletionRequest) -> PreprocessResult:
request,
self.tokenizer,
input_or_inputs=request.prompt,
truncate_prompt_tokens=request.truncate_prompt_tokens,
add_special_tokens=request.add_special_tokens,
)
