Commit 56ae27d

Merge branch 'main' into feature/dynamic-inductor-partition-rules
2 parents 14b6521 + c6187f5 commit 56ae27d

28 files changed: +2466 additions, -589 deletions

docker/Dockerfile

Lines changed: 8 additions & 69 deletions

```diff
@@ -356,75 +356,14 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
     uv pip install --system dist/*.whl --verbose \
         --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 
-# If we need to build FlashInfer wheel before its release:
-# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+
-# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a 12.0'
-# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
-# $ cd flashinfer
-# $ git checkout v0.2.6.post1
-# $ python -m flashinfer.aot
-# $ python -m build --no-isolation --wheel
-# $ ls -la dist
-# -rw-rw-r-- 1 mgoin mgoin 205M Jun 9 18:03 flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
-# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/v0.2.6.post1/flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
-
-# Install FlashInfer from source
-ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
-# Keep this in sync with "flashinfer" extra in setup.py
-ARG FLASHINFER_GIT_REF="v0.4.0"
-# Flag to control whether to compile FlashInfer AOT kernels
-# Set to "true" to enable AOT compilation:
-# docker build --build-arg FLASHINFER_AOT_COMPILE=true ...
-ARG FLASHINFER_AOT_COMPILE=false
-RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
-    . /etc/environment
-    git clone --depth 1 --recursive --shallow-submodules \
-        --branch ${FLASHINFER_GIT_REF} \
-        ${FLASHINFER_GIT_REPO} flashinfer
-    # Exclude CUDA arches for older versions (11.x and 12.0-12.7)
-    # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
-    if [[ "${CUDA_VERSION}" == 11.* ]]; then
-        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
-    elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
-        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
-    else
-        # CUDA 12.8+ supports 10.0a and 12.0
-        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
-    fi
-    pushd flashinfer
-    if [[ "${CUDA_VERSION}" == 12.8.* ]] && [ "$TARGETPLATFORM" = "linux/amd64" ] && [ "${FLASHINFER_GIT_REF}" = "v0.3.1" ]; then
-        # NOTE: To make new precompiled wheels, see tools/flashinfer-build.sh
-        echo "🏗️ Installing FlashInfer from pre-compiled wheel"
-        uv pip install --system https://wheels.vllm.ai/flashinfer-python/flashinfer_python-0.3.1-cp39-abi3-manylinux1_x86_64.whl \
-            --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
-        if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
-            # Download pre-compiled cubins
-            TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-                python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."
-        fi
-    elif [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
-        echo "🏗️ Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
-        export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
-        # HACK: We need these to run flashinfer.aot before installing flashinfer, get from the package in the future
-        uv pip install --system cuda-python==$(echo $CUDA_VERSION | cut -d. -f1,2) pynvml==$(echo $CUDA_VERSION | cut -d. -f1) nvidia-nvshmem-cu$(echo $CUDA_VERSION | cut -d. -f1)
-        # Build AOT kernels
-        TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-            python3 -m flashinfer.aot
-        # Install with no-build-isolation since we already built AOT kernels
-        TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-            uv pip install --system --no-build-isolation . \
-                --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
-        # Download pre-compiled cubins
-        TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-            python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."
-    else
-        echo "🏗️ Installing FlashInfer without AOT compilation in JIT mode"
-        uv pip install --system . \
-            --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
-    fi
-    popd
-    rm -rf flashinfer
-BASH
+# Install FlashInfer pre-compiled kernel cache and binaries
+# https://docs.flashinfer.ai/installation.html
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system flashinfer-cubin==0.4.0 \
+    && uv pip install --system flashinfer-jit-cache==0.4.0 \
+        --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
+    && flashinfer show-config
+
 COPY examples examples
 COPY benchmarks benchmarks
 COPY ./vllm/collect_env.py .
```
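
The new step installs the pre-compiled kernel cache and binaries (`flashinfer-cubin`, `flashinfer-jit-cache`) instead of building FlashInfer from source, while `flashinfer-python` itself now comes in through `requirements/cuda.txt` (see below). A minimal sketch of a runtime spot-check inside the built image, not part of this commit; the package names are simply the ones pinned above and in `requirements/cuda.txt`:

```python
# check_flashinfer.py: report which FlashInfer packages are present in the image.
# A sketch only; package names follow the Dockerfile and requirements/cuda.txt pins.
import importlib.metadata as metadata

for pkg in ("flashinfer-python", "flashinfer-cubin", "flashinfer-jit-cache"):
    try:
        print(f"{pkg}=={metadata.version(pkg)}")
    except metadata.PackageNotFoundError:
        print(f"{pkg}: not installed")
```

The `flashinfer show-config` call in the `RUN` step already prints the effective configuration at build time; the script above is only a runtime check.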

docs/features/tool_calling.md

Lines changed: 9 additions & 2 deletions

```diff
@@ -145,7 +145,7 @@ Supported models:
 Known issues:
 
 1. Mistral 7B struggles to generate parallel tool calls correctly.
-2. Mistral's `tokenizer_config.json` chat template requires tool call IDs that are exactly 9 digits, which is
+2. **For Transformers tokenization backend only**: Mistral's `tokenizer_config.json` chat template requires tool call IDs that are exactly 9 digits, which is
    much shorter than what vLLM generates. Since an exception is thrown when this condition
    is not met, the following additional chat templates are provided:
 
@@ -154,7 +154,14 @@
 * <gh-file:examples/tool_chat_template_mistral_parallel.jinja> - this is a "better" version that adds a tool-use system prompt
   when tools are provided, that results in much better reliability when working with parallel tool calling.
 
-Recommended flags: `--tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja`
+Recommended flags:
+
+1. To use [mistral-common](https://github.com/mistralai/mistral-common) the official Mistral tokenization backend:
+
+    `--tokenizer_mode mistral --config_format mistral --load_format mistral --tool-call-parser mistral`
+
+2. To use the default Transformers tokenization backend:
+    `--tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja`
 
 ### Llama Models (`llama3_json`)
 
```
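
To illustrate the recommended flags, here is a hedged sketch of a tool-calling request against a server launched with the mistral-common backend (option 1 above). The model name, the `get_weather` tool, and the `--enable-auto-tool-choice` flag are illustrative assumptions, not part of this change:

```python
# Sketch: client side of a tool call against a vLLM server started with, e.g.:
#   vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
#     --tokenizer_mode mistral --config_format mistral --load_format mistral \
#     --tool-call-parser mistral --enable-auto-tool-choice
# The model name and tool below are hypothetical examples.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather for a city.",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

response = client.chat.completions.create(
    model="mistralai/Mistral-Small-3.1-24B-Instruct-2503",
    messages=[{"role": "user", "content": "What is the weather in Paris?"}],
    tools=tools,
    tool_choice="auto",
)
print(response.choices[0].message.tool_calls)
```

With option 2 (the Transformers backend), the same client code applies; only the server flags change to `--tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja`.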

examples/offline_inference/audio_language.py

Lines changed: 3 additions & 1 deletion

```diff
@@ -45,10 +45,12 @@ class ModelRequestData(NamedTuple):
 # Voxtral
 def run_voxtral(question: str, audio_count: int) -> ModelRequestData:
     from mistral_common.audio import Audio
-    from mistral_common.protocol.instruct.messages import (
+    from mistral_common.protocol.instruct.chunk import (
         AudioChunk,
         RawAudio,
         TextChunk,
+    )
+    from mistral_common.protocol.instruct.messages import (
         UserMessage,
     )
     from mistral_common.protocol.instruct.request import ChatCompletionRequest
```
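
The import change tracks mistral_common's reorganized layout: content chunk types (`AudioChunk`, `RawAudio`, `TextChunk`) now live in `mistral_common.protocol.instruct.chunk`, while message classes such as `UserMessage` stay in `mistral_common.protocol.instruct.messages`. A minimal sketch of the new layout (a simplified, text-only illustration; the constructor arguments are assumptions and not taken from this commit):

```python
# Sketch: where the chunk and message classes live in mistral_common >= 1.8.5.
from mistral_common.protocol.instruct.chunk import TextChunk
from mistral_common.protocol.instruct.messages import UserMessage

# A text-only user message; audio would use AudioChunk/RawAudio from the same
# `chunk` module, as in the run_voxtral example above.
message = UserMessage(content=[TextChunk(text="Describe the audio clip.")])
print(type(message.content[0]).__module__)  # expected: ...protocol.instruct.chunk
```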

requirements/common.txt

Lines changed: 1 addition & 1 deletion

```diff
@@ -32,7 +32,7 @@ pyzmq >= 25.0.0
 msgspec
 gguf >= 0.13.0
 importlib_metadata; python_version < '3.10'
-mistral_common[image,audio] >= 1.8.2
+mistral_common[image,audio] >= 1.8.5
 opencv-python-headless >= 4.11.0 # required for video IO
 pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
```

requirements/cuda.txt

Lines changed: 2 additions & 0 deletions

```diff
@@ -11,3 +11,5 @@ torchaudio==2.8.0
 torchvision==0.23.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
 # https://github.com/facebookresearch/xformers/releases/tag/v0.0.32.post1
 xformers==0.0.32.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.8
+# FlashInfer should be updated together with the Dockerfile
+flashinfer-python==0.4.0
```

requirements/nightly_torch_test.txt

Lines changed: 1 addition & 1 deletion

```diff
@@ -23,7 +23,7 @@ jiwer # required for audio tests
 timm # required for internvl test
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
-mistral_common[image,audio] >= 1.8.2 # required for voxtral test
+mistral_common[image,audio] >= 1.8.5 # required for voxtral test
 num2words # required for smolvlm test
 opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
```

requirements/test.in

Lines changed: 1 addition & 1 deletion

```diff
@@ -29,7 +29,7 @@ torchaudio==2.8.0
 torchvision==0.23.0
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
-mistral_common[image,audio] >= 1.8.2 # required for voxtral test
+mistral_common[image,audio] >= 1.8.5 # required for voxtral test
 num2words # required for smolvlm test
 open_clip_torch==2.32.0 # Required for nemotron_vl test
 opencv-python-headless >= 4.11.0 # required for video test
```

requirements/test.txt

Lines changed: 1 addition & 3 deletions

```diff
@@ -474,7 +474,7 @@ mbstrdecoder==1.1.3
     #   typepy
 mdurl==0.1.2
     # via markdown-it-py
-mistral-common==1.8.2
+mistral-common==1.8.5
     # via -r requirements/test.in
 mlflow==2.22.0
     # via terratorch
@@ -1012,8 +1012,6 @@ sentence-transformers==3.2.1
     # via
     #   -r requirements/test.in
     #   mteb
-sentencepiece==0.2.0
-    # via mistral-common
 setuptools==77.0.3
     # via
     #   lightning-utilities
```

setup.py

Lines changed: 1 addition & 2 deletions

```diff
@@ -714,8 +714,7 @@ def _read_requirements(filename: str) -> list[str]:
             "mistral_common[audio]",
         ],  # Required for audio processing
         "video": [],  # Kept for backwards compatibility
-        # FlashInfer should be updated together with the Dockerfile
-        "flashinfer": ["flashinfer-python==0.4.0"],
+        "flashinfer": [],  # Kept for backwards compatibility
         # Optional deps for AMD FP4 quantization support
         "petit-kernel": ["petit-kernel"],
     },
```

tests/entrypoints/test_chat_utils.py

Lines changed: 2 additions & 28 deletions

```diff
@@ -6,8 +6,7 @@
 from typing import Literal, Optional
 
 import pytest
-from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy, SpecialTokens
-from mistral_common.tokens.tokenizers.tekken import SpecialTokenInfo, Tekkenizer
+from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy
 
 from vllm.assets.audio import AudioAsset
 from vllm.assets.image import ImageAsset
@@ -2119,34 +2118,9 @@ def test_apply_mistral_chat_template_thinking_chunk():
         },
         {"role": "user", "content": "Thanks, what is 3+3?"},
     ]
-
-    # TODO(Julien): upon model release change to a tokenizer already configured.
-    # =================================================================
     mistral_tokenizer = MistralTokenizer.from_pretrained(
-        "mistralai/Devstral-Small-2507"
-    )
-    assert isinstance(mistral_tokenizer.tokenizer, Tekkenizer)
-    # Add think special tokens to the tokenizer
-    mistral_tokenizer.tokenizer._all_special_tokens[35] = SpecialTokenInfo(
-        rank=35, is_control=True, token_str=SpecialTokens.begin_think.value
+        "mistralai/Magistral-Small-2509"
     )
-    mistral_tokenizer.tokenizer._all_special_tokens[36] = SpecialTokenInfo(
-        rank=36, is_control=True, token_str=SpecialTokens.end_think.value
-    )
-    mistral_tokenizer.tokenizer._special_tokens_reverse_vocab = {
-        k: v
-        for k, v in mistral_tokenizer.tokenizer._special_tokens_reverse_vocab.items()
-        if v not in {35, 36}
-    }
-    mistral_tokenizer.tokenizer._special_tokens_reverse_vocab[
-        SpecialTokens.begin_think.value
-    ] = 35
-    mistral_tokenizer.tokenizer._special_tokens_reverse_vocab[
-        SpecialTokens.end_think.value
-    ] = 36
-    mistral_tokenizer.instruct.BEGIN_THINK = 35
-    mistral_tokenizer.instruct.END_THINK = 36
-    # =================================================================
 
     tokens_ids = apply_mistral_chat_template(
         mistral_tokenizer, messages, chat_template=None, tools=None
```
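
The test simplification relies on `mistralai/Magistral-Small-2509` shipping a tokenizer whose think special tokens are already configured, so the manual patching of `_all_special_tokens` and `_special_tokens_reverse_vocab` deleted above is no longer needed. A hedged sketch of the equivalent check (the import path and the need for Hugging Face access are assumptions; the `BEGIN_THINK`/`END_THINK` attributes are taken from the deleted code):

```python
# Sketch only: verify the think tokens are pre-configured on the tokenizer.
# Requires network access to download the tokenizer files.
from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer

mistral_tokenizer = MistralTokenizer.from_pretrained("mistralai/Magistral-Small-2509")
print(mistral_tokenizer.instruct.BEGIN_THINK, mistral_tokenizer.instruct.END_THINK)
```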
