Commit e82bc4e

chore: update vLLM to 0.10.0 (#2114)
Co-authored-by: alec-flowers <aflowers@nvidia.com>
1 parent 615580d · commit e82bc4e

7 files changed: +27, -25 lines

components/backends/vllm/src/dynamo/vllm/args.py

Lines changed: 5 additions & 1 deletion
@@ -207,7 +207,11 @@ def overwrite_args(config):

     defaults = {
         "task": "generate",
-        "skip_tokenizer_init": True,
+        # As of vLLM >=0.10.0 the engine unconditionally calls
+        # `sampling_params.update_from_tokenizer(...)`, so we can no longer
+        # skip tokenizer initialisation. Setting this to **False** avoids
+        # a NoneType error when the processor accesses the tokenizer.
+        "skip_tokenizer_init": False,
         "disable_log_requests": True,
         # KV routing relies on logging KV metrics
         "disable_log_stats": False,

components/backends/vllm/src/dynamo/vllm/handlers.py

Lines changed: 2 additions & 0 deletions
@@ -110,6 +110,8 @@ async def generate(self, request):
         prompt = TokensPrompt(prompt_token_ids=request["token_ids"])

         sampling_params = SamplingParams(**self.default_sampling_params)
+
+        sampling_params.detokenize = False
         for key, value in request["sampling_options"].items():
             if value is not None and hasattr(sampling_params, key):
                 setattr(sampling_params, key, value)
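The new detokenize = False line tells vLLM to return raw token ids only, presumably because dynamo handles detokenization outside the engine. A hedged sketch of the same pattern in isolation (the default parameters and request payload below are illustrative, not dynamo's real values):

from vllm import SamplingParams

# Illustrative defaults and client request; real values come from the worker config.
default_sampling_params = {"temperature": 0.7}
request = {"sampling_options": {"max_tokens": 64, "top_p": None}}

sampling_params = SamplingParams(**default_sampling_params)
# Skip engine-side detokenization; downstream code turns token ids into text.
sampling_params.detokenize = False
for key, value in request["sampling_options"].items():
    # None means the client did not set the option; unknown keys are ignored.
    if value is not None and hasattr(sampling_params, key):
        setattr(sampling_params, key, value)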

components/backends/vllm/src/dynamo/vllm/publisher.py

Lines changed: 5 additions & 1 deletion
@@ -25,6 +25,7 @@ def record(
         self,
         scheduler_stats: Optional[SchedulerStats],
         iteration_stats: Optional[IterationStats],
+        engine_idx: int = 0,
     ):
         pass

@@ -51,7 +52,10 @@ def set_num_request_total_slots(self, request_total_slots):
         self.request_total_slots = request_total_slots

     def record(
-        self, scheduler_stats: SchedulerStats, iteration_stats: Optional[IterationStats]
+        self,
+        scheduler_stats: SchedulerStats,
+        iteration_stats: Optional[IterationStats],
+        engine_idx: int = 0,
     ):
         # request_total_slots and kv_total_blocks are properties of model + gpu
         # we should only publish them once, not every metric update
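vLLM 0.10.0 passes an extra engine_idx argument when it invokes stat loggers (presumably to disambiguate multiple engine cores), so both record() signatures gain the parameter with a default of 0 to stay backward compatible. A minimal sketch of a logger with the matching signature; the import path and stat field names are assumptions based on the types referenced above:

from typing import Optional

# Assumed import path for the v1 stats types used in publisher.py.
from vllm.v1.metrics.stats import IterationStats, SchedulerStats


class PrintingStatLogger:
    """Illustrative logger exposing the 0.10.0-compatible record() signature."""

    def record(
        self,
        scheduler_stats: Optional[SchedulerStats],
        iteration_stats: Optional[IterationStats],
        engine_idx: int = 0,
    ) -> None:
        if scheduler_stats is not None:
            # Field names assumed from vLLM's v1 SchedulerStats.
            print(f"engine {engine_idx}: "
                  f"{scheduler_stats.num_running_reqs} running / "
                  f"{scheduler_stats.num_waiting_reqs} waiting")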

container/Dockerfile.vllm

Lines changed: 10 additions & 18 deletions
@@ -10,16 +10,15 @@ ARG BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
 ARG RELEASE_BUILD
 ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
 ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04"
-ARG VLLM_REF="059d4cd"
-ARG TORCH_BACKEND="cu128"
-
-# After this commit deepgemm API changed
-# 1.0.0 -> 2.0.0
-ARG DEEPGEMM_REF="03d0be3"
-ARG FLASHINF_REF="1d72ed4"

 # Make sure to update the dependency version in pyproject.toml when updating this
-ARG VLLM_VERSION="0.9.2"
+ARG VLLM_REF="v0.10.0"
+ARG TORCH_BACKEND="cu128"
+
+# Match 0.10.0 vLLM release
+# https://github.com/vllm-project/vllm/releases/tag/v0.10.0
+ARG DEEPGEMM_REF="1876566"
+ARG FLASHINF_REF="v0.2.8rc1"

 # Define general architecture ARGs for supporting both x86 and aarch64 builds.
 # ARCH: Used for package suffixes (e.g., amd64, arm64)

@@ -42,11 +41,10 @@ ARG ARCH_ALT=x86_64

 FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS base

-# Redeclare ARCH, ARCH_ALT, TORCH_BACKEND, VLLM_VERSION so they're available in this stage
+# Redeclare ARCH, ARCH_ALT, TORCH_BACKEND so they're available in this stage
 ARG ARCH
 ARG ARCH_ALT
 ARG TORCH_BACKEND
-ARG VLLM_VERSION

 USER root
 ARG PYTHON_VERSION=3.12

@@ -195,15 +193,11 @@ ENV CUDA_HOME=/usr/local/cuda

 RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
     --mount=type=cache,target=/root/.cache/uv \
-    if [ "$ARCH" = "arm64" ]; then \
     # TODO - split vllm, DeepEP, DeepGeMM, PPLX installs
     # Should be able to select how you want your build to go
     cp /tmp/deps/vllm/install_vllm.sh /tmp/install_vllm.sh && \
     chmod +x /tmp/install_vllm.sh && \
-    /tmp/install_vllm.sh --editable --vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt --deepgemm-ref $DEEPGEMM_REF --flashinf-ref $FLASHINF_REF --torch-backend $TORCH_BACKEND; \
-    else \
-    uv pip install "vllm==${VLLM_VERSION}"; \
-    fi
+    /tmp/install_vllm.sh --editable --vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt --deepgemm-ref $DEEPGEMM_REF --flashinf-ref $FLASHINF_REF --torch-backend $TORCH_BACKEND;

 ENV LD_LIBRARY_PATH=\
     /opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\

@@ -464,9 +458,7 @@ COPY --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX
 COPY --from=wheel_builder /workspace/dist/*.whl wheelhouse/

 # Copies vllm, DeepEP, DeepGEMM, PPLX repos (all editable installs) and nvshmem binaries
-RUN if [ "$ARCH" = "arm64" ]; then \
-    COPY --from=base /opt/vllm /opt/vllm; \
-    fi
+COPY --from=base /opt/vllm /opt/vllm

 ENV LD_LIBRARY_PATH=\
     /opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\

container/deps/vllm/install_vllm.sh

Lines changed: 3 additions & 3 deletions
@@ -20,12 +20,12 @@ set -euo pipefail

 # Parse arguments
 EDITABLE=true
-VLLM_REF="059d4cd"
+VLLM_REF="v0.10.0"
 MAX_JOBS=16
 INSTALLATION_DIR=/tmp
 ARCH=$(uname -m)
-DEEPGEMM_REF="6c9558e"
-FLASHINF_REF="1d72ed4"
+DEEPGEMM_REF="1876566"
+FLASHINF_REF="v0.2.8rc1"
 TORCH_BACKEND="cu128"

 # Convert x86_64 to amd64 for consistency with Docker ARG

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -67,7 +67,7 @@ trtllm =[
 vllm = [
     "uvloop",
     "nixl",
-    "vllm==0.9.2",
+    "vllm==0.10.0",
 ]

 sglang = [

tests/serve/test_vllm.py

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@ class VLLMConfig:
     endpoints: List[str]
     response_handlers: List[Callable[[Any], str]]
     model: str
-    timeout: int = 60
+    timeout: int = 120
     delayed_start: int = 0

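For reference, a self-contained sketch of the dataclass these fields belong to, using the new default; anything beyond the fields visible in the diff is assumed:

from dataclasses import dataclass
from typing import Any, Callable, List


@dataclass
class VLLMConfig:
    endpoints: List[str]
    response_handlers: List[Callable[[Any], str]]
    model: str
    # Doubled from 60s, presumably to give the 0.10.0 engine more startup headroom.
    timeout: int = 120
    delayed_start: int = 0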
