
Commit a9e879b

[Misc] Clean up MiniCPM-V/O code (#15337)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
1 parent 3e2f37a commit a9e879b

File tree: 7 files changed (+531 −661 lines)


examples/offline_inference/vision_language.py

Lines changed: 1 addition & 0 deletions
@@ -361,6 +361,7 @@ def run_llava_next_video(questions: list[str],
     engine_args = EngineArgs(
         model="llava-hf/LLaVA-NeXT-Video-7B-hf",
         max_model_len=8192,
+        max_num_seqs=2,
         disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
     )
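
The added max_num_seqs=2 caps how many sequences the engine schedules concurrently, which keeps the multi-frame video example within typical GPU memory budgets. A minimal sketch of the same engine setup, assuming a host that can hold the 7B model and using only a subset of the arguments shown in the diff:

# Mirrors the updated example's engine setup; illustrative sketch only.
from vllm import EngineArgs

engine_args = EngineArgs(
    model="llava-hf/LLaVA-NeXT-Video-7B-hf",
    max_model_len=8192,  # context length for the multi-frame video prompt
    max_num_seqs=2,      # cap concurrent sequences to bound memory use
)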

tests/models/decoder_only/vision_language/test_models.py

Lines changed: 45 additions & 20 deletions
@@ -163,24 +163,24 @@
         marks=[pytest.mark.core_model, pytest.mark.cpu_model],
     ),
     #### Extended model tests
-    # "aria": VLMTestInfo(
-    #     models=["rhymes-ai/Aria"],
-    #     test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-    #     prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501
-    #     img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n",
-    #     max_model_len=4096,
-    #     max_num_seqs=2,
-    #     auto_cls=AutoModelForImageTextToText,
-    #     single_image_prompts=IMAGE_ASSETS.prompts({
-    #         "stop_sign": "<vlm_image>Please describe the image shortly.",
-    #         "cherry_blossom": "<vlm_image>Please infer the season with reason.", # noqa: E501
-    #     }),
-    #     multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.", # noqa: E501
-    #     stop_str=["<|im_end|>"],
-    #     image_size_factors=[(0.10, 0.15)],
-    #     max_tokens=64,
-    #     marks=[large_gpu_mark(min_gb=64)],
-    # ),
+    "aria": VLMTestInfo(
+        models=["rhymes-ai/Aria"],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501
+        img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n",
+        max_model_len=4096,
+        max_num_seqs=2,
+        auto_cls=AutoModelForImageTextToText,
+        single_image_prompts=IMAGE_ASSETS.prompts({
+            "stop_sign": "<vlm_image>Please describe the image shortly.",
+            "cherry_blossom": "<vlm_image>Please infer the season with reason.", # noqa: E501
+        }),
+        multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.", # noqa: E501
+        stop_str=["<|im_end|>"],
+        image_size_factors=[(0.10, 0.15)],
+        max_tokens=64,
+        marks=[large_gpu_mark(min_gb=64)],
+    ),
     "blip2": VLMTestInfo(
         models=["Salesforce/blip2-opt-2.7b"],
         test_type=VLMTestType.IMAGE,
@@ -352,6 +352,7 @@
         prompt_formatter=lambda vid_prompt: f"USER: {vid_prompt} ASSISTANT:",
         num_video_frames=16,
         max_model_len=4096,
+        max_num_seqs=2,
         auto_cls=AutoModelForVision2Seq,
         vllm_output_post_proc=model_utils.llava_video_vllm_to_hf_output,
     ),
@@ -384,25 +385,49 @@
     ),
     "minicpmo_26": VLMTestInfo(
         models=["openbmb/MiniCPM-o-2_6"],
-        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        test_type=(VLMTestType.IMAGE),
+        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
+        img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
+        max_model_len=4096,
+        max_num_seqs=2,
+        get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
+        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
+        patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
+    ),
+    "minicpmo_26_multi_image": VLMTestInfo(
+        models=["openbmb/MiniCPM-o-2_6"],
+        test_type=(VLMTestType.MULTI_IMAGE),
         prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
         img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
         max_model_len=4096,
         max_num_seqs=2,
         get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
         hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
         patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
+        marks=[large_gpu_mark(min_gb=32)],
     ),
     "minicpmv_26": VLMTestInfo(
         models=["openbmb/MiniCPM-V-2_6"],
-        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        test_type=(VLMTestType.IMAGE),
+        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
+        img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
+        max_model_len=4096,
+        max_num_seqs=2,
+        get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
+        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
+        patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
+    ),
+    "minicpmv_26_multi_image": VLMTestInfo(
+        models=["openbmb/MiniCPM-V-2_6"],
+        test_type=(VLMTestType.MULTI_IMAGE),
         prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
         img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
         max_model_len=4096,
         max_num_seqs=2,
         get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
         hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
         patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
+        marks=[large_gpu_mark(min_gb=32)],
     ),
     "molmo": VLMTestInfo(
         models=["allenai/Molmo-7B-D-0924"],

tests/models/multimodal/processing/test_common.py

Lines changed: 41 additions & 49 deletions
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 
-import copy
 from functools import partial
 from typing import Optional, Union
 
@@ -29,7 +28,7 @@ def _test_processing_correctness(
     hit_rate: float,
     num_batches: int,
     simplify_rate: float,
-    ignore_mm_keys: Optional[list[str]] = None,
+    ignore_mm_keys: Optional[set[str]] = None,
 ):
     model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
     model_info.check_available_online(on_fail="skip")
@@ -145,7 +144,7 @@ def _test_processing_correctness_hf(
     baseline_processor: BaseMultiModalProcessor,
     cached_processor: BaseMultiModalProcessor,
     batch_idx: int,
-    ignore_mm_keys: Optional[list[str]] = None,
+    ignore_mm_keys: Optional[set[str]] = None,
 ):
     if model_config.hf_config.model_type in ("mllama", "whisper", "ultravox"):
         # For some multimodal models, tokenizer will always add bos_token
@@ -167,35 +166,38 @@ def _test_processing_correctness_hf(
         hf_processor_mm_kwargs={},
     )
 
-    assert _inputs_equal(
+    _assert_inputs_equal(
         baseline_result,
         cached_result,
-        ignore_mm_keys,
-    ), f"Failed ({batch_idx=}, {prompt=}, {mm_data=})"
+        ignore_mm_keys=ignore_mm_keys,
+        msg=f"Failed ({batch_idx=}, {prompt=}, {mm_data=})",
+    )
 
     baseline_tokenized_result = baseline_processor.apply(
         token_prompt,
         mm_data=mm_data,
         hf_processor_mm_kwargs={},
     )
 
-    assert _inputs_equal(
+    _assert_inputs_equal(
         baseline_result,
         baseline_tokenized_result,
-        ignore_mm_keys,
-    ), f"Failed ({batch_idx=}, {prompt=}, {mm_data=})"
+        ignore_mm_keys=ignore_mm_keys,
+        msg=f"Failed ({batch_idx=}, {prompt=}, {mm_data=})",
+    )
 
     cached_tokenized_result = cached_processor.apply(
         token_prompt,
         mm_data=mm_data,
         hf_processor_mm_kwargs={},
     )
 
-    assert _inputs_equal(
+    _assert_inputs_equal(
         cached_result,
         cached_tokenized_result,
-        ignore_mm_keys,
-    ), f"Failed ({batch_idx=}, {prompt=}, {mm_data=})"
+        ignore_mm_keys=ignore_mm_keys,
+        msg=f"Failed ({batch_idx=}, {prompt=}, {mm_data=})",
+    )
 
 
 def _test_processing_correctness_mistral(
@@ -206,7 +208,7 @@ def _test_processing_correctness_mistral(
     baseline_processor: BaseMultiModalProcessor,
     cached_processor: BaseMultiModalProcessor,
     batch_idx: int,
-    ignore_mm_keys: Optional[list[str]] = None,
+    ignore_mm_keys: Optional[set[str]] = None,
 ):
     images = mm_data.get("image", [])
     if not isinstance(images, list):
@@ -233,11 +235,12 @@ def _test_processing_correctness_mistral(
         hf_processor_mm_kwargs={},
     )
 
-    assert _inputs_equal(
+    _assert_inputs_equal(
         baseline_tokenized_result,
         cached_tokenized_result,
-        ignore_mm_keys,
-    ), f"Failed ({batch_idx=}, {prompt=}, {mm_data=})"
+        ignore_mm_keys=ignore_mm_keys,
+        msg=f"Failed ({batch_idx=}, {prompt=}, {mm_data=})",
+    )
 
 
 # yapf: disable
@@ -261,6 +264,7 @@ def _test_processing_correctness_mistral(
     "TIGER-Lab/Mantis-8B-siglip-llama3",
     "mistralai/Pixtral-12B-2409",
     "mistral-community/pixtral-12b",
+    "openbmb/MiniCPM-Llama3-V-2_5",
    "openbmb/MiniCPM-o-2_6",
    "openbmb/MiniCPM-V-2_6",
    "allenai/Molmo-7B-D-0924",
@@ -290,7 +294,7 @@ def test_processing_correctness(
     # In Ultravox, the audio_features can be different depending on padding
     # The slight difference should not be a problem though, since
     # attention_mask lets us ignore the difference.
-    ignore_mm_keys = ['audio_features']
+    ignore_mm_keys = {"audio_features"}
 
     _test_processing_correctness(
         model_id,
@@ -328,38 +332,26 @@ def test_processing_correctness_phi3v(
     )
 
 
-def _inputs_equal(
+def _assert_inputs_equal(
     a: MultiModalInputs,
     b: MultiModalInputs,
-    ignore_mm_keys: Optional[list[str]] = None,
+    *,
+    ignore_mm_keys: Optional[set[str]] = None,
+    msg: str = "",
 ):
-    return _drop_mm_kwargs_keys(a, ignore_mm_keys) == _drop_mm_kwargs_keys(
-        b, ignore_mm_keys)
-
-
-def _drop_mm_kwargs_keys(
-    result: MultiModalInputs,
-    ignore_mm_keys: Optional[list[str]] = None,
-) -> MultiModalInputs:
-    """Drop specified keys from result['mm_kwargs'].
-
-    This is mainly to avoid doing exact match of audio_features in ultravox.
-
-    Args:
-        result: Result to drop keys from
-        ignore_mm_keys: List of keys to ignore, e.g. ['audio_features']
-    """
-    if not ignore_mm_keys:
-        return result
-
-    if 'mm_kwargs' in result:
-        result = copy.deepcopy(result)
-        mm_kwargs = result['mm_kwargs']
-        for key in ignore_mm_keys:
-            mm_kwargs.pop(key, None)
-        for items in mm_kwargs._items_by_modality.values():
-            for item in items:
-                for key in ignore_mm_keys:
-                    item.pop(key, None)
-
-    return result
+    if ignore_mm_keys is None:
+        ignore_mm_keys = set()
+
+    if msg is None:
+        assert "mm_kwargs" in a and "mm_kwargs" in b
+    else:
+        assert "mm_kwargs" in a and "mm_kwargs" in b, msg
+
+    for key in ignore_mm_keys:
+        a["mm_kwargs"].pop(key, None)
+        b["mm_kwargs"].pop(key, None)
+
+    if msg is None:
+        assert a == b
+    else:
+        assert a == b, msg
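
With ignore_mm_keys now a set and the comparison folded into a single keyword-only helper, call sites read like the sketch below. This is a self-contained toy: plain dicts stand in for MultiModalInputs, and the data is made up rather than taken from a real processor.

# Toy illustration of the keyword-only comparison pattern used above.
from typing import Optional


def assert_inputs_equal(a: dict, b: dict, *,
                        ignore_mm_keys: Optional[set] = None,
                        msg: str = "") -> None:
    # Drop keys we do not want to compare exactly, then compare the rest.
    for key in (ignore_mm_keys or set()):
        a["mm_kwargs"].pop(key, None)
        b["mm_kwargs"].pop(key, None)
    assert a == b, msg


baseline = {"mm_kwargs": {"pixel_values": [1, 2], "audio_features": [0.10]}}
cached = {"mm_kwargs": {"pixel_values": [1, 2], "audio_features": [0.11]}}

# Padding-dependent audio features are excluded from the comparison, mirroring
# the ignore_mm_keys = {"audio_features"} case for Ultravox in the test above.
assert_inputs_equal(baseline, cached,
                    ignore_mm_keys={"audio_features"},
                    msg="baseline and cached processor outputs diverged")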

vllm/model_executor/models/gemma3_mm.py

Lines changed: 0 additions & 2 deletions
@@ -295,8 +295,6 @@ def _call_hf_processor(
 
         # HF processor pops the `num_crops` kwarg, which is needed by vLLM
         if (images := mm_data.get("images")) is not None:
-            assert isinstance(images, list)
-
             parsed_images = (self._get_data_parser().parse_mm_data({
                 "image":
                 images
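
The dropped assertion means _call_hf_processor no longer insists that mm_data["images"] is already a list before handing it to the data parser. As a generic illustration of the scalar-or-list normalization such parsing typically relies on (illustrative only, not the implementation of vLLM's data parser):

# Generic scalar-or-list normalization idiom for multimodal inputs.
from typing import TypeVar, Union

T = TypeVar("T")


def as_list(value: Union[T, list[T]]) -> list[T]:
    """Wrap a single item in a list; pass lists through unchanged."""
    return value if isinstance(value, list) else [value]


print(as_list("stop_sign.jpg"))        # ['stop_sign.jpg']
print(as_list(["a.jpg", "b.jpg"]))     # ['a.jpg', 'b.jpg']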
