
Commit a333819

ywang96 authored and shreyankg committed
[Misc] Fix input processing for Ultravox (vllm-project#13871)
1 parent: 0ca69bd

File tree

3 files changed (+6, -15 lines)

tests/models/multimodal/processing/test_common.py

Lines changed: 3 additions & 3 deletions

@@ -83,8 +83,8 @@ def _test_processing_correctness(
     }
 
     tokenizer_encode_kwargs = {}
-    if model_config.hf_config.model_type in ("mllama", "whisper"):
-        # For some encoder-decoder models, tokenizer will always add bos_token
+    if model_config.hf_config.model_type in ("mllama", "whisper", "ultravox"):
+        # For some multimodal models, tokenizer will always add bos_token
         # at the beginning of prompt by default, causing hf_processor outputs
         # incorrect token ids. So we need use `add_special_tokens=False` here
         # to leave bos_token to be added by the processor.
@@ -172,7 +172,7 @@ def _test_processing_correctness(
     "Qwen/Qwen2-VL-2B-Instruct",
     "Qwen/Qwen2.5-VL-3B-Instruct",
     "Qwen/Qwen2-Audio-7B-Instruct",
-    "fixie-ai/ultravox-v0_5-llama-3_2-1b",
+    "fixie-ai/ultravox-v0_4",
     "openai/whisper-large-v3",
 ])
 @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
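
Note (not part of the commit): the double-BOS problem this test setting guards against can be reproduced with any Llama-style tokenizer whose encode() prepends bos_token_id by default. A minimal sketch, using a public test tokenizer as an illustrative stand-in:

from transformers import AutoTokenizer

# Illustrative stand-in for the tokenizers used by mllama/whisper/ultravox.
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")

prompt = "Describe the sound."

# By default, encode() prepends bos_token_id ...
with_bos = tokenizer.encode(prompt)
assert with_bos[0] == tokenizer.bos_token_id

# ... but the multimodal processor also prepends BOS, so the test must
# encode the reference prompt with add_special_tokens=False, leaving BOS
# to be added exactly once by the processor.
without_bos = tokenizer.encode(prompt, add_special_tokens=False)
assert without_bos == with_bos[1:]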

tests/models/registry.py

Lines changed: 1 addition & 1 deletion

@@ -284,7 +284,7 @@ def check_available_online(
     "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"),  # noqa: E501
     "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct",  # noqa: E501
                                                           min_transformers_version="4.49"),  # noqa: E501
-    "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b",
+    "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_4",
                                      trust_remote_code=True),
     # [Encoder-decoder]
     "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"),  # noqa: E501

vllm/model_executor/models/ultravox.py

Lines changed: 2 additions & 11 deletions

@@ -146,7 +146,8 @@ def _call_hf_processor(
     ) -> BatchFeature:
         # Text-only input not supported in composite processor
         if not mm_data or not mm_data.get("audios", []):
-            prompt_ids = self.info.get_tokenizer().encode(prompt)
+            prompt_ids = self.info.get_tokenizer().encode(
+                prompt, add_special_tokens=False)
             prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids)
             return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
 
@@ -185,16 +186,6 @@ def _call_hf_processor(
         )
         return BatchFeature(combined_outputs)
 
-    def _apply_hf_processor_tokens_only(
-        self,
-        prompt_tokens: list[int],
-    ) -> list[int]:
-        # HF processor omits bos_token_id by setting add_special_tokens=False
-        tokenizer = self.info.get_tokenizer()
-        assert prompt_tokens[0] == tokenizer.bos_token_id
-
-        return prompt_tokens[1:]
-
     def _get_mm_fields_config(
         self,
         hf_inputs: BatchFeature,
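
Note (not part of the commit): the net effect of this file's change, as a standalone toy sketch (the encode() below is illustrative, not the vLLM code). The removed override stripped the BOS that encode() had just added; the new call never adds it, so the inherited _apply_hf_processor_tokens_only can be used as-is:

# Toy stand-in for a Llama-style tokenizer.encode(); BOS added by default.
BOS = 1

def encode(prompt: str, add_special_tokens: bool = True) -> list[int]:
    ids = [ord(c) for c in prompt]  # fake token ids, one per character
    return [BOS] + ids if add_special_tokens else ids

# Old approach: encode with BOS, then strip it in the removed override.
old_ids = encode("hi")
assert old_ids[0] == BOS
old_ids = old_ids[1:]  # what the deleted _apply_hf_processor_tokens_only did

# New approach: never add BOS here; the HF processor adds it exactly once.
new_ids = encode("hi", add_special_tokens=False)

assert old_ids == new_ids  # same token ids, one fewer override to maintain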
