Skip to content

Commit 54d929d

Browse files
committed
wip
Signed-off-by: Julien Denize <julien.denize@mistral.ai>
1 parent 95ba523 commit 54d929d

File tree

4 files changed

+8
-4
lines changed

4 files changed

+8
-4
lines changed

tests/models/language/generation/test_mistral.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ def test_mistral_format(
208208
with vllm_runner(
209209
model,
210210
dtype=dtype,
211-
tokenizer_mode="auto",
211+
tokenizer_mode="hf",
212212
load_format="safetensors",
213213
config_format="hf",
214214
) as hf_format_model:

tests/models/quantization/test_bitsandbytes.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,7 @@ def test_4bit_bnb_moe_model(
155155
quantization="bitsandbytes",
156156
enforce_eager=False,
157157
default_torch_num_threads=1,
158+
tokenizer_mode="hf",
158159
) as llm:
159160
vllm_outputs = llm.generate_greedy_logprobs(
160161
example_prompts, max_tokens=32, num_logprobs=5
@@ -204,6 +205,7 @@ def test_4bit_bnb_embedding_model(
204205
gpu_memory_utilization=0.5,
205206
quantization="bitsandbytes",
206207
default_torch_num_threads=1,
208+
tokenizer_mode="hf",
207209
) as vllm_model:
208210
vllm_outputs = vllm_model.embed(example_prompts)
209211

@@ -256,6 +258,7 @@ def validate_generated_texts(
256258
tensor_parallel_size=vllm_tp_size,
257259
enforce_eager=False,
258260
default_torch_num_threads=1,
261+
tokenizer_mode="hf",
259262
) as llm:
260263
vllm_outputs = llm.generate_greedy(prompts, max_tokens)
261264
vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner")

vllm/config/model.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,8 @@ class ModelConfig:
128128
name or path will be used."""
129129
tokenizer_mode: TokenizerMode = "auto"
130130
"""Tokenizer mode:\n
131-
- "auto" will use the fast tokenizer if available.\n
131+
- "auto" will use "hf" tokenizer if Mistral's tokenizer is not available.\n
132+
- "hf" will use the fast tokenizer if available.\n
132133
- "slow" will always use the slow tokenizer.\n
133134
- "mistral" will always use the tokenizer from `mistral_common`.\n
134135
- "custom" will use --tokenizer to select the preregistered tokenizer."""

vllm/model_executor/model_loader/default_loader.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -108,11 +108,11 @@ def _prepare_weights(
108108
)
109109
)
110110
> 0
111-
else "auto"
111+
else "hf"
112112
)
113113

114114
# Some quantized models use .pt files for storing the weights.
115-
if load_format in ["auto", "hf"]:
115+
if load_format == "hf":
116116
allow_patterns = ["*.safetensors", "*.bin"]
117117
elif load_format == "safetensors" or load_format == "fastsafetensors":
118118
use_safetensors = True

0 commit comments

Comments
 (0)