# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+from collections.abc import Set as AbstractSet
from functools import partial

import numpy as np
@@ -22,14 +23,17 @@
from vllm.multimodal.inputs import MultiModalInputs
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
from vllm.transformers_utils.tokenizer import (
-    AnyTokenizer,
    MistralTokenizer,
    cached_tokenizer_from_config,
    encode_tokens,
)

from ....multimodal.utils import random_audio, random_image, random_video
-from ...registry import HF_EXAMPLE_MODELS
+from ...registry import (
+    _MULTIMODAL_EXAMPLE_MODELS,
+    _TRANSFORMERS_BACKEND_MODELS,
+    HF_EXAMPLE_MODELS,
+)


def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
@@ -83,6 +87,119 @@ def create_metadata(frames: np.ndarray):
    return mm_data


+# For some multimodal models, the tokenizer always adds bos_token at the
+# beginning of the prompt by default, causing hf_processor to output
+# incorrect token IDs, so we use `add_special_tokens=False` here and leave
+# bos_token to be added by the processor.
+_ADD_SPECIAL_TOKENS_OVERRIDES = {
+    "ovis": False,
+    "ovis2_5": False,
+    "paligemma": False,
+    "ultravox": False,
+    "whisper": False,
+}
+
+_IGNORE_MM_KEYS = {
+    # In Ultravox, the audio_features can be different depending on padding
+    # The slight difference should not be a problem though, since
+    # attention_mask lets us ignore the difference.
+    "ultravox": {"audio_features"},
+}
+
+MM_DATA_PATCHES = {
+    # GLM4.1V and Qwen3-VL require video metadata to be included in the input
+    "glm4v": glm4_1v_patch_mm_data,
+    "glm4v_moe": glm4_1v_patch_mm_data,
+    "qwen3_vl": qwen3_vl_patch_mm_data,
+    "qwen3_vl_moe": qwen3_vl_patch_mm_data,
+}
+
+
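+# Enumerate the example checkpoints to test for each architecture: the default
+# model ID plus any extras, skipping quantized extras (those whose type
+# contains "fp") since testing them is redundant.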
+def _iter_model_ids_to_test(model_arch_list: AbstractSet[str]):
+    for model_arch in model_arch_list:
+        model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
+        yield model_info.default
+
+        for extra_type, extra_model_id in model_info.extras.items():
+            if "fp" in extra_type:
+                continue  # Redundant to test quantized models
+
+            yield extra_model_id
+
+
+def _get_model_ids_to_test(model_arch_list: AbstractSet[str]):
+    return list(_iter_model_ids_to_test(model_arch_list))
+
+
+def get_model_ids_to_test():
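+    # Example checkpoints registered for the Transformers fallback backend;
+    # these are assumed to be exercised by the Transformers-backend tests,
+    # so they are filtered out below.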
+    transformers_arch_ids = {
+        model_id
+        for info in _TRANSFORMERS_BACKEND_MODELS.values()
+        for model_id in (info.default, *info.extras.values())
+    }
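+    # Multimodal architectures none of whose example checkpoints appear above,
+    # i.e. those backed by a vLLM-native multimodal processor.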
+    vllm_only_archs = {
+        arch
+        for arch, info in _MULTIMODAL_EXAMPLE_MODELS.items()
+        if not any(
+            model_id in transformers_arch_ids
+            for model_id in (info.default, *info.extras.values())
+        )
+    }
+
+    return _get_model_ids_to_test(vllm_only_archs)
+
+
+def get_text_token_prompts(
+    processor: BaseMultiModalProcessor,
+    mm_data: MultiModalDataDict,
+):
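+    # Build the (text_prompt, token_prompt) pair to feed to the processor;
+    # text_prompt is None for Mistral tokenizers, which only produce token IDs.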
+    dummy_inputs = processor.dummy_inputs
+    tokenizer = processor.info.get_tokenizer()
+    model_config = processor.info.ctx.model_config
+
+    model_type = model_config.hf_config.model_type
+    if model_type in MM_DATA_PATCHES:
+        mm_data = MM_DATA_PATCHES[model_type](mm_data)
+
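+    # Parse the raw multimodal data and count the items per modality so that
+    # the dummy prompt contains the matching number of placeholders.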
+    parsed_data = processor.data_parser.parse_mm_data(mm_data)
+    mm_counts = {k: len(vs) for k, vs in parsed_data.items()}
+
+    text_prompt: str | None
+    token_prompt: list[int]
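+    # Mistral chat outputs tokens directly, rather than text prompts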
+    if isinstance(tokenizer, MistralTokenizer):
+        images = parsed_data.get("image", [])
+        request = ChatCompletionRequest(
+            messages=[
+                UserMessage(
+                    content=[
+                        TextChunk(text=""),
+                        *(ImageChunk(image=image) for image in images),
+                    ]
+                ),
+            ]
+        )
+        res = tokenizer.mistral.encode_chat_completion(request)
+
+        # Mistral does not support decode_tokens with skip_special_tokens=False
+        text_prompt = None
+        token_prompt = res.tokens
+    else:
+        inputs = dummy_inputs.get_dummy_processor_inputs(
+            model_config.max_model_len,
+            mm_counts,
+        )
+        assert isinstance(inputs.prompt, str)
+
+        text_prompt = inputs.prompt
+        token_prompt = encode_tokens(
+            tokenizer,
+            text_prompt,
+            add_special_tokens=_ADD_SPECIAL_TOKENS_OVERRIDES.get(model_type),
+        )
+
+    return text_prompt, token_prompt
+
+
def _test_processing_correctness(
    model_id_or_arch: str,
    hit_rate: float,
@@ -148,8 +265,6 @@ def _to_dummy_options(modality: str, count: int) -> BaseDummyOptions:

    baseline_processor = factories.build_processor(ctx, cache=None)
    cached_processor = factories.build_processor(ctx, cache=cache)
-    dummy_inputs = baseline_processor.dummy_inputs
-    tokenizer = baseline_processor.info.get_tokenizer()

    rng = np.random.RandomState(0)

@@ -175,29 +290,6 @@ def _to_dummy_options(modality: str, count: int) -> BaseDummyOptions:
            for k, limit in limit_mm_per_prompt_ints.items()
        }

-        mm_counts = {k: len(vs) for k, vs in mm_data.items()}
-
-        # Mistral chat outputs tokens directly, rather than text prompts
-        if isinstance(tokenizer, MistralTokenizer):
-            images = mm_data.get("image", [])
-            request = ChatCompletionRequest(
-                messages=[
-                    UserMessage(
-                        content=[
-                            TextChunk(text=""),
-                            *(ImageChunk(image=image) for image in images),
-                        ]
-                    ),
-                ]
-            )
-            res = tokenizer.mistral.encode_chat_completion(request)
-            prompt = res.tokens
-        else:
-            prompt = dummy_inputs.get_dummy_processor_inputs(
-                model_config.max_model_len,
-                mm_counts,
-            ).prompt
-
        # Drop unnecessary keys and test single -> multi conversion
        if rng.rand() < simplify_rate:
            for k in list(mm_data.keys()):
@@ -208,68 +300,24 @@ def _to_dummy_options(modality: str, count: int) -> BaseDummyOptions:

        _test_processing_correctness_one(
            model_config,
-            tokenizer,
-            prompt,
            mm_data,
            baseline_processor,
            cached_processor,
            batch_idx,
        )


-# For some multimodal models, tokenizer will always add bos_token
-# at the beginning of prompt by default, causing hf_processor outputs
-# incorrect token ids. So we need use `add_special_tokens=False` here
-# to leave bos_token to be added by the processor.
-_ADD_SPECIAL_TOKENS_OVERRIDES = {
-    "ovis": False,
-    "ovis2_5": False,
-    "paligemma": False,
-    "ultravox": False,
-    "whisper": False,
-}
-
-_IGNORE_MM_KEYS = {
-    # In Ultravox, the audio_features can be different depending on padding
-    # The slight difference should not be a problem though, since
-    # attention_mask lets us ignore the difference.
-    "ultravox": {"audio_features"},
-}
-
-MM_DATA_PATCHES = {
-    # GLM4.1V and Qwen3-VL requires video metadata to be included in the input
-    "glm4v": glm4_1v_patch_mm_data,
-    "glm4v_moe": glm4_1v_patch_mm_data,
-    "qwen3_vl": qwen3_vl_patch_mm_data,
-    "qwen3_vl_moe": qwen3_vl_patch_mm_data,
-}
-
-
def _test_processing_correctness_one(
    model_config: ModelConfig,
-    tokenizer: AnyTokenizer,
-    prompt: str | list[int],
    mm_data: MultiModalDataDict,
    baseline_processor: BaseMultiModalProcessor,
    cached_processor: BaseMultiModalProcessor,
    batch_idx: int,
):
    model_type = model_config.hf_config.model_type
-    ignore_mm_keys = _IGNORE_MM_KEYS.get(model_type, set[str]())
-    if model_type in MM_DATA_PATCHES:
-        mm_data = MM_DATA_PATCHES[model_type](mm_data)

-    if isinstance(prompt, str):
-        text_prompt = prompt
-        token_prompt = encode_tokens(
-            tokenizer,
-            prompt,
-            add_special_tokens=_ADD_SPECIAL_TOKENS_OVERRIDES.get(model_type),
-        )
-    else:
-        # Mistral does not support decode_tokens with skip_special_tokens=False
-        text_prompt = None
-        token_prompt = prompt
+    text_prompt, token_prompt = get_text_token_prompts(baseline_processor, mm_data)
+    ignore_mm_keys = _IGNORE_MM_KEYS.get(model_type, set[str]())

    baseline_tokenized_result = baseline_processor.apply(
        token_prompt,
@@ -324,81 +372,7 @@ def _test_processing_correctness_one(
    )


-@pytest.mark.parametrize(
-    "model_id",
-    [
-        "rhymes-ai/Aria",
-        "CohereForAI/aya-vision-8b",
-        "Open-Bee/Bee-8B-RL",
-        "Salesforce/blip2-opt-2.7b",
-        "facebook/chameleon-7b",
-        "CohereLabs/command-a-vision-07-2025",
-        "deepseek-ai/deepseek-vl2-tiny",
-        "deepseek-ai/DeepSeek-OCR",
-        "baidu/ERNIE-4.5-VL-28B-A3B-PT",
-        "adept/fuyu-8b",
-        "google/gemma-3-4b-it",
-        "google/gemma-3n-E2B-it",
-        "zai-org/glm-4v-9b",
-        "zai-org/GLM-4.1V-9B-Thinking",
-        "zai-org/GLM-4.5V",
-        "ibm-granite/granite-speech-3.3-2b",
-        "h2oai/h2ovl-mississippi-800m",
-        "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
-        "HuggingFaceM4/Idefics3-8B-Llama3",
-        "internlm/Intern-S1",
-        "OpenGVLab/InternVL2-1B",
-        "OpenGVLab/InternVL3-1B",
-        "OpenGVLab/InternVL3_5-1B",
-        "OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview",
-        "OpenGVLab/InternVL3_5-30B-A3B",
-        "Kwai-Keye/Keye-VL-8B-Preview",
-        "Kwai-Keye/Keye-VL-1_5-8B",
-        "moonshotai/Kimi-VL-A3B-Instruct",
-        "meta-llama/Llama-4-Scout-17B-16E-Instruct",
-        "llava-hf/llava-1.5-7b-hf",
-        "llava-hf/llava-v1.6-mistral-7b-hf",
-        "llava-hf/LLaVA-NeXT-Video-7B-hf",
-        "llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
-        "TIGER-Lab/Mantis-8B-siglip-llama3",
-        "mispeech/midashenglm-7b",
-        "openbmb/MiniCPM-Llama3-V-2_5",
-        "openbmb/MiniCPM-o-2_6",
-        "openbmb/MiniCPM-V-2_6",
-        "MiniMaxAI/MiniMax-VL-01",
-        "allenai/Molmo-7B-D-0924",
-        "allenai/Molmo-7B-O-0924",
-        "nvidia/NVLM-D-72B",
-        "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1",
-        "AIDC-AI/Ovis1.6-Gemma2-9B",
-        "AIDC-AI/Ovis1.6-Llama3.2-3B",
-        "AIDC-AI/Ovis2-1B",
-        "AIDC-AI/Ovis2.5-2B",
-        "google/paligemma-3b-mix-224",
-        "google/paligemma2-3b-ft-docci-448",
-        "microsoft/Phi-3.5-vision-instruct",
-        "microsoft/Phi-4-multimodal-instruct",
-        "mistralai/Pixtral-12B-2409",
-        "mistral-community/pixtral-12b",
-        "Qwen/Qwen-VL-Chat",
-        "Qwen/Qwen2-VL-2B-Instruct",
-        "Qwen/Qwen2.5-VL-3B-Instruct",
-        "Qwen/Qwen2-Audio-7B-Instruct",
-        "Qwen/Qwen2.5-Omni-3B",
-        "Qwen/Qwen3-VL-4B-Instruct",
-        "Qwen/Qwen3-VL-30B-A3B-Instruct",
-        "Qwen/Qwen3-Omni-30B-A3B-Instruct",
-        "YannQi/R-4B",
-        "Skywork/Skywork-R1V-38B",
-        "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
-        "stepfun-ai/step3",
-        "fixie-ai/ultravox-v0_5-llama-3_2-1b",
-        "openai/whisper-large-v3",
-        "omni-research/Tarsier-7b",
-        "omni-research/Tarsier2-Recap-7b",
-        "mistralai/Voxtral-Mini-3B-2507",
-    ],
-)
+@pytest.mark.parametrize("model_id", get_model_ids_to_test())
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
@pytest.mark.parametrize("num_batches", [32])
@pytest.mark.parametrize("simplify_rate", [1.0])
@@ -409,7 +383,12 @@ def test_processing_correctness(
    simplify_rate: float,
):
    if model_id == "google/gemma-3n-E2B-it":
-        pytest.skip("Skipping gemma-3n-E2B-it due to transformers #39911 bug.")
+        pytest.skip("Fix later")
+    if model_id == "OpenGVLab/InternVL2-2B":
+        pytest.skip("Fix later")
+    if model_id == "jinaai/jina-reranker-m0":
+        pytest.skip("Fix later")
+
    _test_processing_correctness(
        model_id,
        hit_rate=hit_rate,