Skip to content

Commit d30e9fd

Browse files
juliendenize and 0xrushi
authored and committed
Refactor MistralTokenizer (vllm-project#26358)
Signed-off-by: Julien Denize <julien.denize@mistral.ai> Signed-off-by: 0xrushi <6279035+0xrushi@users.noreply.github.com>
1 parent dbb08e8 commit d30e9fd

File tree

18 files changed

+2349
-461
lines changed

18 files changed

+2349
-461
lines changed

docs/features/tool_calling.md

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ Supported models:
145145
Known issues:
146146

147147
1. Mistral 7B struggles to generate parallel tool calls correctly.
148-
2. Mistral's `tokenizer_config.json` chat template requires tool call IDs that are exactly 9 digits, which is
148+
2. **For Transformers tokenization backend only**: Mistral's `tokenizer_config.json` chat template requires tool call IDs that are exactly 9 digits, which is
149149
much shorter than what vLLM generates. Since an exception is thrown when this condition
150150
is not met, the following additional chat templates are provided:
151151

@@ -154,7 +154,14 @@ Known issues:
154154
* <gh-file:examples/tool_chat_template_mistral_parallel.jinja> - this is a "better" version that adds a tool-use system prompt
155155
when tools are provided, that results in much better reliability when working with parallel tool calling.
156156

157-
Recommended flags: `--tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja`
157+
Recommended flags:
158+
159+
1. To use [mistral-common](https://github.com/mistralai/mistral-common), the official Mistral tokenization backend:
160+
161+
`--tokenizer_mode mistral --config_format mistral --load_format mistral --tool-call-parser mistral`
162+
163+
2. To use the default Transformers tokenization backend:
164+
`--tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja`
158165

159166
### Llama Models (`llama3_json`)
160167

examples/offline_inference/audio_language.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,10 +45,12 @@ class ModelRequestData(NamedTuple):
4545
# Voxtral
4646
def run_voxtral(question: str, audio_count: int) -> ModelRequestData:
4747
from mistral_common.audio import Audio
48-
from mistral_common.protocol.instruct.messages import (
48+
from mistral_common.protocol.instruct.chunk import (
4949
AudioChunk,
5050
RawAudio,
5151
TextChunk,
52+
)
53+
from mistral_common.protocol.instruct.messages import (
5254
UserMessage,
5355
)
5456
from mistral_common.protocol.instruct.request import ChatCompletionRequest

requirements/common.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ pyzmq >= 25.0.0
3232
msgspec
3333
gguf >= 0.13.0
3434
importlib_metadata; python_version < '3.10'
35-
mistral_common[image,audio] >= 1.8.2
35+
mistral_common[image,audio] >= 1.8.5
3636
opencv-python-headless >= 4.11.0 # required for video IO
3737
pyyaml
3838
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12

requirements/nightly_torch_test.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ jiwer # required for audio tests
2323
timm # required for internvl test
2424
transformers_stream_generator # required for qwen-vl test
2525
matplotlib # required for qwen-vl test
26-
mistral_common[image,audio] >= 1.8.2 # required for voxtral test
26+
mistral_common[image,audio] >= 1.8.5 # required for voxtral test
2727
num2words # required for smolvlm test
2828
opencv-python-headless >= 4.11.0 # required for video test
2929
datamodel_code_generator # required for minicpm3 test

requirements/test.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ torchaudio==2.8.0
2929
torchvision==0.23.0
3030
transformers_stream_generator # required for qwen-vl test
3131
matplotlib # required for qwen-vl test
32-
mistral_common[image,audio] >= 1.8.2 # required for voxtral test
32+
mistral_common[image,audio] >= 1.8.5 # required for voxtral test
3333
num2words # required for smolvlm test
3434
open_clip_torch==2.32.0 # Required for nemotron_vl test
3535
opencv-python-headless >= 4.11.0 # required for video test

requirements/test.txt

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -474,7 +474,7 @@ mbstrdecoder==1.1.3
474474
# typepy
475475
mdurl==0.1.2
476476
# via markdown-it-py
477-
mistral-common==1.8.2
477+
mistral-common==1.8.5
478478
# via -r requirements/test.in
479479
mlflow==2.22.0
480480
# via terratorch
@@ -1012,8 +1012,6 @@ sentence-transformers==3.2.1
10121012
# via
10131013
# -r requirements/test.in
10141014
# mteb
1015-
sentencepiece==0.2.0
1016-
# via mistral-common
10171015
setuptools==77.0.3
10181016
# via
10191017
# lightning-utilities

tests/entrypoints/test_chat_utils.py

Lines changed: 2 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,7 @@
66
from typing import Literal, Optional
77

88
import pytest
9-
from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy, SpecialTokens
10-
from mistral_common.tokens.tokenizers.tekken import SpecialTokenInfo, Tekkenizer
9+
from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy
1110

1211
from vllm.assets.audio import AudioAsset
1312
from vllm.assets.image import ImageAsset
@@ -2119,34 +2118,9 @@ def test_apply_mistral_chat_template_thinking_chunk():
21192118
},
21202119
{"role": "user", "content": "Thanks, what is 3+3?"},
21212120
]
2122-
2123-
# TODO(Julien): upon model release change to a tokenizer already configured.
2124-
# =================================================================
21252121
mistral_tokenizer = MistralTokenizer.from_pretrained(
2126-
"mistralai/Devstral-Small-2507"
2127-
)
2128-
assert isinstance(mistral_tokenizer.tokenizer, Tekkenizer)
2129-
# Add think special tokens to the tokenizer
2130-
mistral_tokenizer.tokenizer._all_special_tokens[35] = SpecialTokenInfo(
2131-
rank=35, is_control=True, token_str=SpecialTokens.begin_think.value
2122+
"mistralai/Magistral-Small-2509"
21322123
)
2133-
mistral_tokenizer.tokenizer._all_special_tokens[36] = SpecialTokenInfo(
2134-
rank=36, is_control=True, token_str=SpecialTokens.end_think.value
2135-
)
2136-
mistral_tokenizer.tokenizer._special_tokens_reverse_vocab = {
2137-
k: v
2138-
for k, v in mistral_tokenizer.tokenizer._special_tokens_reverse_vocab.items()
2139-
if v not in {35, 36}
2140-
}
2141-
mistral_tokenizer.tokenizer._special_tokens_reverse_vocab[
2142-
SpecialTokens.begin_think.value
2143-
] = 35
2144-
mistral_tokenizer.tokenizer._special_tokens_reverse_vocab[
2145-
SpecialTokens.end_think.value
2146-
] = 36
2147-
mistral_tokenizer.instruct.BEGIN_THINK = 35
2148-
mistral_tokenizer.instruct.END_THINK = 36
2149-
# =================================================================
21502124

21512125
tokens_ids = apply_mistral_chat_template(
21522126
mistral_tokenizer, messages, chat_template=None, tools=None

tests/models/multimodal/generation/test_pixtral.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
import pytest
88
from mistral_common.multimodal import download_image
9-
from mistral_common.protocol.instruct.messages import ImageURLChunk
9+
from mistral_common.protocol.instruct.chunk import ImageURLChunk
1010
from mistral_common.protocol.instruct.request import ChatCompletionRequest
1111
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
1212
from mistral_common.tokens.tokenizers.multimodal import image_from_chunk

tests/models/multimodal/generation/test_voxtral.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,8 @@
66
import pytest
77
import pytest_asyncio
88
from mistral_common.audio import Audio
9-
from mistral_common.protocol.instruct.messages import (
10-
AudioChunk,
11-
RawAudio,
12-
TextChunk,
13-
UserMessage,
14-
)
9+
from mistral_common.protocol.instruct.chunk import AudioChunk, RawAudio, TextChunk
10+
from mistral_common.protocol.instruct.messages import UserMessage
1511

1612
from vllm.transformers_utils.tokenizer import MistralTokenizer
1713

tests/models/multimodal/processing/test_common.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66

77
import numpy as np
88
import pytest
9-
from mistral_common.protocol.instruct.messages import ImageChunk, TextChunk, UserMessage
9+
from mistral_common.protocol.instruct.chunk import ImageChunk, TextChunk
10+
from mistral_common.protocol.instruct.messages import UserMessage
1011
from mistral_common.protocol.instruct.request import ChatCompletionRequest
1112
from PIL import Image
1213

0 commit comments

Comments (0)