Commit 359b093

bump vllm
Signed-off-by: Alec <aflowers@nvidia.com>
1 parent c433447 commit 359b093

File tree: 3 files changed, +125 −122 lines
container/Dockerfile.vllm

Lines changed: 10 additions & 13 deletions
@@ -12,23 +12,22 @@ ARG RELEASE_BUILD
 ARG ENABLE_KVBM=false
 ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
 ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04"
+ARG CUDA_VERSION="12.8"
 
 # Make sure to update the dependency version in pyproject.toml when updating this
-ARG VLLM_REF="1da94e673c257373280026f75ceb4effac80e892" # from v0.10.1.1
+ARG VLLM_REF="v0.10.2"
+# FlashInfer only respected when building vLLM from source, ie when VLLM_REF does not start with 'v'
+ARG FLASHINF_REF=""
 ARG TORCH_BACKEND="cu128"
 
+# If left blank, then we will fallback to vLLM defaults
+ARG DEEPGEMM_REF=""
+
 # sccache configuration - inherit from base build
 ARG USE_SCCACHE
 ARG SCCACHE_BUCKET=""
 ARG SCCACHE_REGION=""
 
-# Match 0.10.1.1 vLLM release
-# https://github.com/vllm-project/vllm/releases/tag/v0.10.1.1
-# Pinned to commit before https://github.com/deepseek-ai/DeepGEMM/pull/112 for DeepGEMM which seems to break on H100:
-# "RuntimeError: Failed: CUDA runtime error csrc/jit/kernel_runtime.hpp:108 '98'"
-ARG DEEPGEMM_REF="f85ec64"
-ARG FLASHINF_REF="v0.2.11"
-
 # Define general architecture ARGs for supporting both x86 and aarch64 builds.
 # ARCH: Used for package suffixes (e.g., amd64, arm64)
 # ARCH_ALT: Used for Rust targets, manylinux suffix (e.g., x86_64, aarch64)
@@ -108,6 +107,7 @@ ARG VLLM_GIT_URL
 ARG DEEPGEMM_REF
 ARG FLASHINF_REF
 ARG TORCH_BACKEND
+ARG CUDA_VERSION
 
 ARG MAX_JOBS=16
 ENV MAX_JOBS=$MAX_JOBS
@@ -138,18 +138,15 @@ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
     --mount=type=cache,target=/root/.cache/uv \
    --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
-    # TODO - split vllm, DeepEP, DeepGeMM, PPLX installs
-    # Should be able to select how you want your build to go
     cp /tmp/deps/vllm/install_vllm.sh /tmp/install_vllm.sh && \
     chmod +x /tmp/install_vllm.sh && \
-    /tmp/install_vllm.sh --editable --vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt --deepgemm-ref $DEEPGEMM_REF --flashinf-ref $FLASHINF_REF --torch-backend $TORCH_BACKEND && \
+    /tmp/install_vllm.sh --editable --vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt ${DEEPGEMM_REF:+--deepgemm-ref "$DEEPGEMM_REF"} ${FLASHINF_REF:+--flashinf-ref "$FLASHINF_REF"} --torch-backend $TORCH_BACKEND --cuda-version $CUDA_VERSION && \
     /tmp/use-sccache.sh show-stats "vLLM";
 
 ENV LD_LIBRARY_PATH=\
 /opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\
 $LD_LIBRARY_PATH
 
-
 ##################################################
 ########## Runtime Image ########################
 ##################################################
@@ -362,4 +359,4 @@ RUN uv pip install maturin[patchelf]
 ENV PYTHONPATH=${WORKSPACE_DIR}/components/metrics/src:${WORKSPACE_DIR}/components/frontend/src:${WORKSPACE_DIR}/components/planner/src:${WORKSPACE_DIR}/components/backends/mocker/src:${WORKSPACE_DIR}/components/backends/trtllm/src:${WORKSPACE_DIR}/components/backends/vllm/src:${WORKSPACE_DIR}/components/backends/sglang/src:${WORKSPACE_DIR}/components/backends/llama_cpp/src
 
 ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
-CMD []
+CMD []
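
A note on the reworked install line: the ${VAR:+...} expansions forward --deepgemm-ref and --flashinf-ref only when the corresponding build ARG is non-empty, so blank ARGs fall through to the script's (and ultimately vLLM's) defaults. A minimal illustrative sketch of that expansion, outside the Dockerfile (values here are examples only):

# Illustrative sketch only: ${VAR:+...} drops the flag when the variable is empty.
DEEPGEMM_REF=""            # empty -> no --deepgemm-ref flag is emitted
FLASHINF_REF="v0.3.0"      # set   -> flag and value are passed through
echo install_vllm.sh \
    ${DEEPGEMM_REF:+--deepgemm-ref "$DEEPGEMM_REF"} \
    ${FLASHINF_REF:+--flashinf-ref "$FLASHINF_REF"}
# Prints: install_vllm.sh --flashinf-ref v0.3.0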

container/deps/vllm/install_vllm.sh

Lines changed: 114 additions & 108 deletions
@@ -1,44 +1,35 @@
 #!/usr/bin/env bash
 # SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Install vllm and wideEP kernels from a specific git reference
+
+# This script is used to install vLLM and its dependencies
+# If installing vLLM from a release tag, we will use pip to manage the install
+# Otherwise, we will use git to checkout the vLLM source code and build it from source.
+# The dependencies are installed in the following order:
+# 1. vLLM
+# 2. LMCache
+# 3. DeepGEMM
+# 4. EP kernels
 
 set -euo pipefail
 
-# Parse arguments
-EDITABLE=true
-# REMOVE nvshmem cherry-pick when moving to next version of vllm
-VLLM_REF="1da94e673c257373280026f75ceb4effac80e892" # from v0.10.1.1
-# When updating above VLLM_REF make sure precompiled wheel file URL is correct. Run this command:
-# aws s3 ls s3://vllm-wheels/${VLLM_REF}/ --region us-west-2 --no-sign-request
-VLLM_PRECOMPILED_WHEEL_LOCATION="https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_REF}/vllm-0.10.1.1-cp38-abi3-manylinux1_x86_64.whl"
-VLLM_GIT_URL="https://github.com/vllm-project/vllm.git"
+VLLM_REF="v0.10.2"
+
+# Basic Configurations
+ARCH=$(uname -m)
 MAX_JOBS=16
 INSTALLATION_DIR=/tmp
-ARCH=$(uname -m)
-DEEPGEMM_REF="f85ec64"
-FLASHINF_REF="v0.2.11"
+
+# VLLM and Dependency Configurations
 TORCH_BACKEND="cu128"
+TORCH_CUDA_ARCH_LIST="9.0;10.0" # For EP Kernels
+DEEPGEMM_REF=""
+CUDA_VERSION="12.8" # For DEEPGEMM
 
-# Convert x86_64 to amd64 for consistency with Docker ARG
-if [ "$ARCH" = "x86_64" ]; then
-    ARCH="amd64"
-elif [ "$ARCH" = "aarch64" ]; then
-    ARCH="arm64"
-fi
+# These flags are applicable when installing vLLM from source code
+EDITABLE=true
+VLLM_GIT_URL="https://github.com/vllm-project/vllm.git"
+FLASHINF_REF="v0.3.0"
 
 while [[ $# -gt 0 ]]; do
     case $1 in
@@ -82,8 +73,16 @@ while [[ $# -gt 0 ]]; do
             TORCH_BACKEND="$2"
             shift 2
             ;;
+        --torch-cuda-arch-list)
+            TORCH_CUDA_ARCH_LIST="$2"
+            shift 2
+            ;;
+        --cuda-version)
+            CUDA_VERSION="$2"
+            shift 2
+            ;;
         -h|--help)
-            echo "Usage: $0 [--editable|--no-editable] [--vllm-ref REF] [--max-jobs NUM] [--arch ARCH] [--deepgemm-ref REF] [--flashinf-ref REF] [--torch-backend BACKEND]"
+            echo "Usage: $0 [--editable|--no-editable] [--vllm-ref REF] [--max-jobs NUM] [--arch ARCH] [--deepgemm-ref REF] [--flashinf-ref REF] [--torch-backend BACKEND] [--torch-cuda-arch-list LIST] [--cuda-version VERSION]"
             echo "Options:"
             echo " --editable Install vllm in editable mode (default)"
             echo " --no-editable Install vllm in non-editable mode"
@@ -94,6 +93,8 @@ while [[ $# -gt 0 ]]; do
             echo " --deepgemm-ref REF Git reference for DeepGEMM (default: ${DEEPGEMM_REF})"
             echo " --flashinf-ref REF Git reference for Flash Infer (default: ${FLASHINF_REF})"
             echo " --torch-backend BACKEND Torch backend to use (default: ${TORCH_BACKEND})"
+            echo " --torch-cuda-arch-list LIST CUDA architectures to compile for (default: ${TORCH_CUDA_ARCH_LIST})"
+            echo " --cuda-version VERSION CUDA version to use (default: ${CUDA_VERSION})"
             exit 0
             ;;
         *)
@@ -103,105 +104,110 @@ while [[ $# -gt 0 ]]; do
     esac
 done
 
+# Convert x86_64 to amd64 for consistency with Docker ARG
+if [ "$ARCH" = "x86_64" ]; then
+    ARCH="amd64"
+elif [ "$ARCH" = "aarch64" ]; then
+    ARCH="arm64"
+fi
+
 export MAX_JOBS=$MAX_JOBS
 export CUDA_HOME=/usr/local/cuda
 
-echo "Installing vllm with the following configuration:"
-echo " EDITABLE: $EDITABLE"
-echo " VLLM_REF: $VLLM_REF"
-echo " MAX_JOBS: $MAX_JOBS"
-echo " ARCH: $ARCH"
-echo " TORCH_BACKEND: $TORCH_BACKEND"
-
-# Install common dependencies
+echo "=== Installing prerequisites ==="
 uv pip install pip cuda-python
 
-if [ "$ARCH" = "amd64" ]; then
-    # LMCache installation currently fails on arm64 due to CUDA dependency issues:
-    # OSError: CUDA_HOME environment variable is not set. Please set it to your CUDA install root.
-    # TODO: Re-enable for arm64 after verifying lmcache compatibility and resolving the build issue.
-    uv pip install lmcache==0.3.3
-fi
+echo "\n=== Configuration Summary ==="
+echo " VLLM_REF=$VLLM_REF | EDITABLE=$EDITABLE | ARCH=$ARCH"
+echo " MAX_JOBS=$MAX_JOBS | TORCH_BACKEND=$TORCH_BACKEND | CUDA_VERSION=$CUDA_VERSION"
+echo " TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST"
+echo " DEEPGEMM_REF=$DEEPGEMM_REF | FLASHINF_REF=$FLASHINF_REF"
+echo " INSTALLATION_DIR=$INSTALLATION_DIR | VLLM_GIT_URL=$VLLM_GIT_URL"
 
-# Create vllm directory and clone
-mkdir -p $INSTALLATION_DIR
+echo "\n=== Cloning vLLM repository ==="
 cd $INSTALLATION_DIR
 git clone $VLLM_GIT_URL vllm
 cd vllm
 git checkout $VLLM_REF
-# nvshmem fix - cherry-pick commit pinning pplx version
-# https://github.com/ai-dynamo/dynamo/actions/runs/17907241473/job/50910654042?pr=2969#step:8:280
-# remove when moving to next version of vllm
-# Configure git user for cherry-pick operation
-GIT_COMMITTER_NAME="Container Build" GIT_COMMITTER_EMAIL="container@buildkitsandbox.local" git cherry-pick 906e461ed6ddccd3cc7b68fa72048d2d3fcbd72c
-
-if [ "$ARCH" = "arm64" ]; then
-    echo "Installing vllm for ARM64 architecture"
-
-    # Try to install specific PyTorch version first, fallback to latest nightly
-    echo "Attempting to install pinned PyTorch nightly versions..."
-    if ! uv pip install torch==2.7.1+cu128 torchaudio==2.7.1 torchvision==0.22.1 --index-url https://download.pytorch.org/whl; then
-        echo "Pinned versions failed"
-        exit 1
-        # uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
-    fi
 
-    python use_existing_torch.py
-    uv pip install -r requirements/build.txt
+echo "\n=== Installing vLLM ==="
 
-    if [ "$EDITABLE" = "true" ]; then
-        MAX_JOBS=${MAX_JOBS} uv pip install --no-build-isolation -e . -v
-    else
-        MAX_JOBS=${MAX_JOBS} uv pip install --no-build-isolation . -v
-    fi
+if [[ $VLLM_REF =~ ^v ]]; then
+    # VLLM_REF starts with 'v' - use pip install with version tag
+    echo "Installing vLLM $VLLM_REF from PyPI..."
+    uv pip install vllm[flashinfer]==$VLLM_REF --torch-backend=$TORCH_BACKEND
 else
-    echo "Installing vllm for AMD64 architecture"
+    # VLLM_REF does not start with 'v' - use git checkout path
+    if [ "$ARCH" = "arm64" ]; then
+        echo "Building vLLM from source for ARM64 architecture..."
+
+        # Try to install specific PyTorch version first, fallback to latest nightly
+        echo "Attempting to install pinned PyTorch nightly versions..."
+        if ! uv pip install torch==2.7.1+cu128 torchaudio==2.7.1 torchvision==0.22.1 --index-url https://download.pytorch.org/whl; then
+            echo "Pinned versions failed"
+            exit 1
+            # uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
+        fi
 
-    echo "Attempting to install pinned OpenAI version..."
-    if ! uv pip install openai==1.99.9; then
-        echo "Pinned versions failed"
-        exit 1
-    fi
+        python use_existing_torch.py
+        uv pip install -r requirements/build.txt
+
+        if [ "$EDITABLE" = "true" ]; then
+            MAX_JOBS=${MAX_JOBS} uv pip install --no-build-isolation -e . -v
+        else
+            MAX_JOBS=${MAX_JOBS} uv pip install --no-build-isolation . -v
+        fi
 
-    export VLLM_PRECOMPILED_WHEEL_LOCATION="${VLLM_PRECOMPILED_WHEEL_LOCATION}"
+        echo "\n=== Installing FlashInfer from source ==="
+        cd $INSTALLATION_DIR
+        git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
+        cd flashinfer
+        git checkout $FLASHINF_REF
+        uv pip install -v --no-build-isolation .
 
-    if [ "$EDITABLE" = "true" ]; then
-        uv pip install -e . --torch-backend=$TORCH_BACKEND
     else
-        uv pip install . --torch-backend=$TORCH_BACKEND
-    fi
-fi
+        echo "Building vLLM from source for AMD64 architecture..."
 
-# Install ep_kernels and DeepGEMM
-echo "Installing ep_kernels and DeepGEMM"
-cd tools/ep_kernels
-TORCH_CUDA_ARCH_LIST="9.0;10.0" bash install_python_libraries.sh # These libraries aren't pinned.
-cd ep_kernels_workspace
-git clone https://github.com/deepseek-ai/DeepGEMM.git
-cd DeepGEMM
-git checkout $DEEPGEMM_REF # Pin Version
+        # When updating above VLLM_REF make sure precompiled wheel file URL is correct. Run this command:
+        # aws s3 ls s3://vllm-wheels/${VLLM_REF}/ --region us-west-2 --no-sign-request
+        export VLLM_PRECOMPILED_WHEEL_LOCATION="https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_REF}/vllm-0.10.2-cp38-abi3-manylinux1_x86_64.whl"
 
-sed -i 's|git@github.com:|https://github.com/|g' .gitmodules
-git submodule sync --recursive
-git submodule update --init --recursive
+        if [ "$EDITABLE" = "true" ]; then
+            uv pip install -e . --torch-backend=$TORCH_BACKEND
+        else
+            uv pip install . --torch-backend=$TORCH_BACKEND
+        fi
 
-# command for 03d0be3
-python setup.py install
+        echo "\n=== Installing FlashInfer from PyPI ==="
+        uv pip install flashinfer-python==$FLASHINF_REF
 
-# new install command for post 03d0be3
-# cat install.sh
-# ./install.sh
+    fi
+fi
 
+echo "✓ vLLM installation completed"
 
-# Install Flash Infer
-if [ "$ARCH" = "arm64" ]; then
-    uv pip install flashinfer-python
+echo "\n=== Installing LMCache ==="
+if [ "$ARCH" = "amd64" ]; then
+    # LMCache installation currently fails on arm64 due to CUDA dependency issues:
+    # OSError: CUDA_HOME environment variable is not set. Please set it to your CUDA install root.
+    # TODO: Re-enable for arm64 after verifying lmcache compatibility and resolving the build issue.
+    uv pip install lmcache==0.3.3
+    echo "✓ LMCache installed"
+else
+    echo "⚠ Skipping LMCache on ARM64 (compatibility issues)"
+fi
+
+echo "\n=== Installing DeepGEMM ==="
+cd tools/
+if [ -n "$DEEPGEMM_REF" ]; then
+    bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}" --ref "$DEEPGEMM_REF"
 else
-    cd $INSTALLATION_DIR
-    git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
-    cd flashinfer
-    git checkout $FLASHINF_REF
-    uv pip install -v --no-build-isolation .
+    bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}"
 fi
+echo "✓ DeepGEMM installation completed"
+
+echo "\n=== Installing EP Kernels (PPLX and DeepEP) ==="
+cd ep_kernels/
+TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" bash install_python_libraries.sh
 
-echo "vllm installation completed successfully"
+echo "\n✅ All installations completed successfully!"

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -54,7 +54,7 @@ trtllm =[
 vllm = [
     "uvloop",
     "nixl<=0.4.1",
-    "vllm[flashinfer]==0.10.1.1",
+    "vllm[flashinfer]==0.10.2",
 ]
 
 sglang = [
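
The pin now matches the Dockerfile's VLLM_REF=v0.10.2, as the Dockerfile comment requests. A hypothetical sanity check that an environment matches the bumped extra, using the same uv flag the install script relies on (not part of the commit):

# Illustrative only; not part of the commit.
uv pip install "vllm[flashinfer]==0.10.2" --torch-backend=cu128
python -c "import vllm; print(vllm.__version__)"   # expect: 0.10.2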
