examples/offline_inference/vision_language.py — 1 change: 1 addition & 0 deletions

@@ -361,6 +361,7 @@ def run_llava_next_video(questions: list[str],
engine_args = EngineArgs(
model="llava-hf/LLaVA-NeXT-Video-7B-hf",
max_model_len=8192,
+ max_num_seqs=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
)

tests/models/decoder_only/vision_language/test_models.py — 65 changes: 45 additions & 20 deletions

@@ -163,24 +163,24 @@
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
#### Extended model tests
# "aria": VLMTestInfo(
# models=["rhymes-ai/Aria"],
# test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
# prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501
# img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n",
# max_model_len=4096,
# max_num_seqs=2,
# auto_cls=AutoModelForImageTextToText,
# single_image_prompts=IMAGE_ASSETS.prompts({
# "stop_sign": "<vlm_image>Please describe the image shortly.",
# "cherry_blossom": "<vlm_image>Please infer the season with reason.", # noqa: E501
# }),
# multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.", # noqa: E501
# stop_str=["<|im_end|>"],
# image_size_factors=[(0.10, 0.15)],
# max_tokens=64,
# marks=[large_gpu_mark(min_gb=64)],
# ),
"aria": VLMTestInfo(
models=["rhymes-ai/Aria"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501
img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n",
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
single_image_prompts=IMAGE_ASSETS.prompts({
"stop_sign": "<vlm_image>Please describe the image shortly.",
"cherry_blossom": "<vlm_image>Please infer the season with reason.", # noqa: E501
}),
multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.", # noqa: E501
stop_str=["<|im_end|>"],
image_size_factors=[(0.10, 0.15)],
max_tokens=64,
marks=[large_gpu_mark(min_gb=64)],
),
"blip2": VLMTestInfo(
models=["Salesforce/blip2-opt-2.7b"],
test_type=VLMTestType.IMAGE,
@@ -352,6 +352,7 @@
prompt_formatter=lambda vid_prompt: f"USER: {vid_prompt} ASSISTANT:",
num_video_frames=16,
max_model_len=4096,
+ max_num_seqs=2,
auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.llava_video_vllm_to_hf_output,
),
@@ -384,25 +385,49 @@
),
"minicpmo_26": VLMTestInfo(
models=["openbmb/MiniCPM-o-2_6"],
- test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+ test_type=(VLMTestType.IMAGE),
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
max_model_len=4096,
max_num_seqs=2,
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
),
"minicpmo_26_multi_image": VLMTestInfo(
models=["openbmb/MiniCPM-o-2_6"],
test_type=(VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
max_model_len=4096,
max_num_seqs=2,
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
marks=[large_gpu_mark(min_gb=32)],
),
"minicpmv_26": VLMTestInfo(
models=["openbmb/MiniCPM-V-2_6"],
- test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+ test_type=(VLMTestType.IMAGE),
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
max_model_len=4096,
max_num_seqs=2,
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
),
"minicpmv_26_multi_image": VLMTestInfo(
models=["openbmb/MiniCPM-V-2_6"],
test_type=(VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
max_model_len=4096,
max_num_seqs=2,
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
marks=[large_gpu_mark(min_gb=32)],
),
"molmo": VLMTestInfo(
models=["allenai/Molmo-7B-D-0924"],
tests/models/multimodal/processing/test_common.py — 90 changes: 41 additions & 49 deletions

@@ -1,6 +1,5 @@
# SPDX-License-Identifier: Apache-2.0

- import copy
from functools import partial
from typing import Optional, Union

@@ -29,7 +28,7 @@ def _test_processing_correctness(
hit_rate: float,
num_batches: int,
simplify_rate: float,
- ignore_mm_keys: Optional[list[str]] = None,
+ ignore_mm_keys: Optional[set[str]] = None,
):
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
model_info.check_available_online(on_fail="skip")
@@ -145,7 +144,7 @@ def _test_processing_correctness_hf(
baseline_processor: BaseMultiModalProcessor,
cached_processor: BaseMultiModalProcessor,
batch_idx: int,
- ignore_mm_keys: Optional[list[str]] = None,
+ ignore_mm_keys: Optional[set[str]] = None,
):
if model_config.hf_config.model_type in ("mllama", "whisper", "ultravox"):
# For some multimodal models, tokenizer will always add bos_token
@@ -167,35 +166,38 @@ def _test_processing_correctness_hf(
hf_processor_mm_kwargs={},
)

- assert _inputs_equal(
+ _assert_inputs_equal(
baseline_result,
cached_result,
- ignore_mm_keys,
- ), f"Failed ({batch_idx=}, {prompt=}, {mm_data=})"
+ ignore_mm_keys=ignore_mm_keys,
+ msg=f"Failed ({batch_idx=}, {prompt=}, {mm_data=})",
+ )

baseline_tokenized_result = baseline_processor.apply(
token_prompt,
mm_data=mm_data,
hf_processor_mm_kwargs={},
)

- assert _inputs_equal(
+ _assert_inputs_equal(
baseline_result,
baseline_tokenized_result,
- ignore_mm_keys,
- ), f"Failed ({batch_idx=}, {prompt=}, {mm_data=})"
+ ignore_mm_keys=ignore_mm_keys,
+ msg=f"Failed ({batch_idx=}, {prompt=}, {mm_data=})",
+ )

cached_tokenized_result = cached_processor.apply(
token_prompt,
mm_data=mm_data,
hf_processor_mm_kwargs={},
)

- assert _inputs_equal(
+ _assert_inputs_equal(
cached_result,
cached_tokenized_result,
- ignore_mm_keys,
- ), f"Failed ({batch_idx=}, {prompt=}, {mm_data=})"
+ ignore_mm_keys=ignore_mm_keys,
+ msg=f"Failed ({batch_idx=}, {prompt=}, {mm_data=})",
+ )


def _test_processing_correctness_mistral(
@@ -206,7 +208,7 @@ def _test_processing_correctness_mistral(
baseline_processor: BaseMultiModalProcessor,
cached_processor: BaseMultiModalProcessor,
batch_idx: int,
- ignore_mm_keys: Optional[list[str]] = None,
+ ignore_mm_keys: Optional[set[str]] = None,
):
images = mm_data.get("image", [])
if not isinstance(images, list):
@@ -233,11 +235,12 @@
hf_processor_mm_kwargs={},
)

- assert _inputs_equal(
+ _assert_inputs_equal(
baseline_tokenized_result,
cached_tokenized_result,
- ignore_mm_keys,
- ), f"Failed ({batch_idx=}, {prompt=}, {mm_data=})"
+ ignore_mm_keys=ignore_mm_keys,
+ msg=f"Failed ({batch_idx=}, {prompt=}, {mm_data=})",
+ )


# yapf: disable
@@ -261,6 +264,7 @@ def _test_processing_correctness_mistral(
"TIGER-Lab/Mantis-8B-siglip-llama3",
"mistralai/Pixtral-12B-2409",
"mistral-community/pixtral-12b",
"openbmb/MiniCPM-Llama3-V-2_5",
Copy link
Member Author

@DarkLight1337 DarkLight1337 Mar 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Need this to test the else branch in _base_call_hf_processor

"openbmb/MiniCPM-o-2_6",
"openbmb/MiniCPM-V-2_6",
"allenai/Molmo-7B-D-0924",
@@ -290,7 +294,7 @@ def test_processing_correctness(
# In Ultravox, the audio_features can be different depending on padding
# The slight difference should not be a problem though, since
# attention_mask lets us ignore the difference.
- ignore_mm_keys = ['audio_features']
+ ignore_mm_keys = {"audio_features"}

_test_processing_correctness(
model_id,
@@ -328,38 +332,26 @@ def test_processing_correctness_phi3v(
)


- def _inputs_equal(
+ def _assert_inputs_equal(
a: MultiModalInputs,
b: MultiModalInputs,
- ignore_mm_keys: Optional[list[str]] = None,
+ *,
+ ignore_mm_keys: Optional[set[str]] = None,
+ msg: str = "",
):
- return _drop_mm_kwargs_keys(a, ignore_mm_keys) == _drop_mm_kwargs_keys(
- b, ignore_mm_keys)
-
-
- def _drop_mm_kwargs_keys(
- result: MultiModalInputs,
- ignore_mm_keys: Optional[list[str]] = None,
- ) -> MultiModalInputs:
- """Drop specified keys from result['mm_kwargs'].
-
- This is mainly to avoid doing exact match of audio_features in ultravox.
-
- Args:
- result: Result to drop keys from
- ignore_mm_keys: List of keys to ignore, e.g. ['audio_features']
- """
- if not ignore_mm_keys:
- return result
-
- if 'mm_kwargs' in result:
- result = copy.deepcopy(result)
- mm_kwargs = result['mm_kwargs']
- for key in ignore_mm_keys:
- mm_kwargs.pop(key, None)
- for items in mm_kwargs._items_by_modality.values():
- for item in items:
- for key in ignore_mm_keys:
- item.pop(key, None)
-
- return result
+ if ignore_mm_keys is None:
+ ignore_mm_keys = set()
+
+ if msg is None:
+ assert "mm_kwargs" in a and "mm_kwargs" in b
+ else:
+ assert "mm_kwargs" in a and "mm_kwargs" in b, msg
+
+ for key in ignore_mm_keys:
+ a["mm_kwargs"].pop(key, None)
+ b["mm_kwargs"].pop(key, None)
+
+ if msg is None:
+ assert a == b
+ else:
+ assert a == b, msg

Review comment — @DarkLight1337 (Member, Author), Mar 22, 2025:
These changes are to let pytest show the non-matching items in more detail.
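
For context on that remark: pytest's assertion rewriting only introspects a comparison that sits directly in an assert statement, so asserting on a helper's boolean return value reports little beyond the failed call, while asserting a == b itself lets pytest print which entries differ. A minimal, self-contained sketch of the difference — the helper names and toy dicts below are illustrative, not the vLLM test code:

import pytest


def _inputs_equal_old_style(a: dict, b: dict) -> bool:
    # The comparison happens inside the helper and only a bool reaches the
    # caller's assert, so a failure cannot show which keys differ.
    return a == b


def _assert_inputs_equal_new_style(a: dict, b: dict, *, msg: str = "") -> None:
    # The `==` sits directly in the assert, so pytest's assertion rewriting
    # prints a detailed comparison of the two dicts when they do not match.
    assert a == b, msg


def test_old_style_failure_is_opaque():
    with pytest.raises(AssertionError):
        assert _inputs_equal_old_style({"x": 1}, {"x": 2}), "inputs differ"


def test_new_style_failure_shows_the_diff():
    with pytest.raises(AssertionError):
        _assert_inputs_equal_new_style({"x": 1}, {"x": 2}, msg="inputs differ")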
vllm/model_executor/models/gemma3_mm.py — 2 changes: 0 additions & 2 deletions

@@ -295,8 +295,6 @@ def _call_hf_processor(

# HF processor pops the `num_crops` kwarg, which is needed by vLLM
if (images := mm_data.get("images")) is not None:
- assert isinstance(images, list)
-

Review comment on lines -298 to -299 — @DarkLight1337 (Member, Author):
Unnecessary check since the data is parsed in the next line.

parsed_images = (self._get_data_parser().parse_mm_data({
"image":
images