
Commit 82de9b9

[Misc] Automatically resolve HF processor init kwargs (#22005)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
1 parent ad57f23 · commit 82de9b9

40 files changed: +332 -725 lines

examples/offline_inference/vision_language.py

Lines changed: 19 additions & 19 deletions
@@ -449,25 +449,6 @@ def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
     )
 
 
-# omni-research/Tarsier-7b
-def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
-    assert modality == "image"
-    model_name = "omni-research/Tarsier-7b"
-
-    engine_args = EngineArgs(
-        model=model_name,
-        trust_remote_code=True,
-        max_model_len=4096,
-        limit_mm_per_prompt={modality: 1},
-    )
-    prompts = [(f"USER: <image>\n{question} ASSISTANT:") for question in questions]
-
-    return ModelRequestData(
-        engine_args=engine_args,
-        prompts=prompts,
-    )
-
-
 # Intern-S1
 def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
     model_name = "internlm/Intern-S1"
@@ -1293,6 +1274,25 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
     )
 
 
+# omni-research/Tarsier-7b
+def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+    model_name = "omni-research/Tarsier-7b"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=4096,
+        limit_mm_per_prompt={modality: 1},
+    )
+    prompts = [(f"USER: <image>\n{question} ASSISTANT:") for question in questions]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
     model_name = "omni-research/Tarsier2-Recap-7b"

tests/lora/test_qwen2vl.py

Lines changed: 0 additions & 6 deletions
@@ -4,8 +4,6 @@
 from typing import Optional
 
 import pytest
-from packaging.version import Version
-from transformers import __version__ as TRANSFORMERS_VERSION
 
 import vllm
 from vllm.assets.image import ImageAsset
@@ -185,10 +183,6 @@ def test_qwen2vl_lora_beam_search(qwen2vl_lora_files):
     current_platform.is_rocm(),
     reason="Qwen2.5-VL dependency xformers incompatible with ROCm",
 )
-@pytest.mark.skipif(
-    Version(TRANSFORMERS_VERSION) < Version("4.49.0"),
-    reason="Qwen2.5-VL require transformers version no lower than 4.49.0",
-)
 def test_qwen25vl_lora(qwen25vl_lora_files):
     """Test Qwen 2.5 VL model with LoRA"""
     config = TestConfig(model_path=QWEN25VL_MODEL_PATH,

tests/models/multimodal/generation/test_common.py

Lines changed: 26 additions & 1 deletion
@@ -702,13 +702,38 @@
     "smolvlm": VLMTestInfo(
         models=["HuggingFaceTB/SmolVLM2-2.2B-Instruct"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        prompt_formatter=lambda img_prompt:f"<|im_start|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
+        prompt_formatter=lambda img_prompt: f"<|im_start|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
         img_idx_to_prompt=lambda idx: "<image>",
         max_model_len=8192,
         max_num_seqs=2,
         auto_cls=AutoModelForImageTextToText,
         hf_output_post_proc=model_utils.smolvlm_trunc_hf_output,
     ),
+    "tarsier": VLMTestInfo(
+        models=["omni-research/Tarsier-7b"],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt: f"USER: {img_prompt} ASSISTANT:",
+        max_model_len=4096,
+        max_num_seqs=2,
+        auto_cls=AutoModelForImageTextToText,
+        patch_hf_runner=model_utils.tarsier_patch_hf_runner,
+    ),
+    "tarsier2": VLMTestInfo(
+        models=["omni-research/Tarsier2-Recap-7b"],
+        test_type=(
+            VLMTestType.IMAGE,
+            VLMTestType.MULTI_IMAGE,
+            VLMTestType.VIDEO,
+        ),
+        prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
+        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",  # noqa: E501
+        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>",  # noqa: E501
+        max_model_len=4096,
+        max_num_seqs=2,
+        auto_cls=AutoModelForImageTextToText,
+        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+        marks=[pytest.mark.skip("Model initialization hangs")],
+    ),
     ### Tensor parallel / multi-gpu broadcast tests
     "chameleon-broadcast": VLMTestInfo(
         models=["facebook/chameleon-7b"],

tests/models/multimodal/generation/vlm_utils/model_utils.py

Lines changed: 12 additions & 0 deletions
@@ -818,3 +818,15 @@ def qwen2_5_omni_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
     thinker.get_output_embeddings = lambda: thinker.lm_head
     hf_model.model = thinker
     return hf_model
+
+
+def tarsier_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+    from vllm.model_executor.models.tarsier import get_vision_encoder_info
+
+    vision_encoder_info = get_vision_encoder_info(hf_model.config)
+
+    hf_processor = hf_model.processor
+    if hf_processor.patch_size is None:
+        hf_processor.patch_size = vision_encoder_info.get_patch_size()
+
+    return hf_model
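
Note (an assumption about intent, not stated in the commit): if the checkpoint's processor config leaves patch_size unset, an HF LLaVA-style processor cannot work out how many image tokens an image expands to, so the test fills it in from vLLM's vision encoder info. A minimal standalone sketch of that fallback, with hypothetical stand-in classes and an assumed value of 14:

# Hypothetical stand-ins for illustration only; not the Tarsier checkpoint's
# actual values or vLLM's actual classes.
class _FakeVisionEncoderInfo:
    def get_patch_size(self) -> int:
        return 14  # assumed patch size


class _FakeProcessor:
    patch_size = None  # unset in the checkpoint's processor config


processor = _FakeProcessor()
if processor.patch_size is None:
    processor.patch_size = _FakeVisionEncoderInfo().get_patch_size()

assert processor.patch_size == 14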

tests/models/multimodal/processing/test_transformers.py

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ def test_multimodal_processor(model_id):
         model_impl="transformers",
     )
 
-    mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config, )
+    mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config)
 
     image_pil = ImageAsset('cherry_blossom').pil_image
     mm_data = {"image": image_pil}

tests/models/registry.py

Lines changed: 1 addition & 2 deletions
@@ -465,8 +465,7 @@ def check_available_online(
                                       is_available_online=False),
     "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b",  # noqa: E501
                                      trust_remote_code=True),
-    "TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b",  # noqa: E501
-                                                       hf_overrides={"architectures": ["TarsierForConditionalGeneration"]}),  # noqa: E501
+    "TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b"),  # noqa: E501
     "Tarsier2ForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier2-Recap-7b",  # noqa: E501
                                                         hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]}),  # noqa: E501
     "VoxtralForConditionalGeneration": _HfExamplesInfo(

tests/multimodal/test_processing.py

Lines changed: 70 additions & 37 deletions
@@ -2,16 +2,15 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from contextlib import nullcontext
-from types import MethodType
-from typing import cast
+from typing import Optional, cast
 from unittest.mock import MagicMock
 
 import numpy as np
 import pytest
 import torch
-from transformers import ProcessorMixin
 
 from vllm.config import ModelConfig
+from vllm.inputs import InputProcessingContext
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargs,
                                     MultiModalKwargsItem,
@@ -1013,57 +1012,91 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
     )
 
 
-class _ProcessorProxy:
+class DummyProcessor:
 
-    def __init__(self, processor: ProcessorMixin) -> None:
+    def __init__(self, a: int = 0, b: int = 0) -> None:
         super().__init__()
 
-        self.__processor = processor
-
-    def __getattr__(self, key: str):
-        return getattr(self.__processor, key)
+        self.a = a
+        self.b = b
 
     def __call__(
         self,
-        text=None,
-        images=None,
-        videos=None,
-        exists=None,
-        return_tensors=None,
-    ):
-        return dict(exists=exists)
+        a: int = 0,
+        c: int = 0,
+        return_tensors: Optional[str] = None,
+    ) -> dict[str, int]:
+        return dict(a=a, c=c)
 
 
-@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])  # Dummy
 # yapf: disable
+@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])  # Dummy
 @pytest.mark.parametrize(
-    ("call_kwargs", "expected_kwargs"),
+    ("config_kwargs", "inference_kwargs", "expected_kwargs"),
     [
-        # Should ignore invalid kwargs
-        ({"does_not_exist": 100}, {"exists": None}),
-        ({"exists": 1}, {"exists": 1}),
-        ({"does_not_exist": 100, "exists": 1}, {"exists": 1}),
+        ({"a": 1}, {}, {"a": 1, "b": 0}),
+        ({}, {"a": 1}, {"a": 1, "b": 0}),
+        # inference_kwargs should take precedence
+        ({"a": 1}, {"a": 2}, {"a": 2, "b": 0}),
+        # Should ignore extra kwargs
+        ({"a": 1, "c": 1}, {}, {"a": 1, "b": 0}),
+        ({"b": 1, "c": 1}, {}, {"a": 0, "b": 1}),
     ],
 )
 # yapf: enable
-def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
-    model_config = ModelConfig(model_id)
+def test_hf_processor_init_kwargs(
+    model_id,
+    config_kwargs,
+    inference_kwargs,
+    expected_kwargs,
+):
+    # Should not be used since there is nothing to convert to tokens
+    mock_tokenizer = cast(AnyTokenizer, object())
 
-    processor = MULTIMODAL_REGISTRY.create_processor(model_config)
-    orig_get_hf_processor = processor.info.get_hf_processor
+    ctx = InputProcessingContext(
+        model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs),
+        tokenizer=mock_tokenizer,
+    )
+
+    processor = ctx.get_hf_processor(
+        DummyProcessor,  # type: ignore[arg-type]
+        **inference_kwargs,
+    )
+
+    for k, v in expected_kwargs.items():
+        assert getattr(processor, k) == v
 
-    def get_hf_processor(self, **kwargs):
-        assert kwargs == call_kwargs
-        return _ProcessorProxy(orig_get_hf_processor())
 
-    processor.info.get_hf_processor = MethodType(get_hf_processor,
-                                                 processor.info)
+# yapf: disable
+@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])  # Dummy
+@pytest.mark.parametrize(
+    ("config_kwargs", "inference_kwargs", "expected_kwargs"),
+    [
+        ({"a": 1}, {}, {"a": 1, "c": 0}),
+        ({}, {"a": 1}, {"a": 1, "c": 0}),
+        # inference_kwargs should take precedence
+        ({"a": 1}, {"a": 2}, {"a": 2, "c": 0}),
+        # Should ignore extra kwargs
+        ({"a": 1, "c": 1}, {}, {"a": 1, "c": 1}),
+        ({"b": 1, "c": 1}, {}, {"a": 0, "c": 1}),
+    ],
+)
+# yapf: enable
+def test_hf_processor_call_kwargs(
+    model_id,
+    config_kwargs,
+    inference_kwargs,
+    expected_kwargs,
+):
+    # Should not be used since there is nothing to convert to tokens
+    mock_tokenizer = cast(AnyTokenizer, object())
 
-    out_kwargs = processor._call_hf_processor(
-        prompt="",
-        mm_data={},
-        mm_kwargs=call_kwargs,
-        tok_kwargs={},
+    ctx = InputProcessingContext(
+        model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs),
+        tokenizer=mock_tokenizer,
     )
 
-    assert out_kwargs == expected_kwargs
+    processor = ctx.get_hf_processor(DummyProcessor)  # type: ignore[arg-type]
+
+    result = ctx.call_hf_processor(processor, {}, inference_kwargs)
+    assert result == expected_kwargs
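
Taken together, the two new tests cover both resolution paths: test_hf_processor_init_kwargs checks kwargs resolved against DummyProcessor.__init__ when the processor is constructed, while test_hf_processor_call_kwargs checks kwargs resolved against __call__ when it is invoked; in both, inference-time kwargs override config-level ones and unknown names are dropped. A small standalone restatement of those expectations, reusing the DummyProcessor definition from the diff above:

from typing import Optional


class DummyProcessor:
    def __init__(self, a: int = 0, b: int = 0) -> None:
        self.a = a
        self.b = b

    def __call__(self, a: int = 0, c: int = 0,
                 return_tensors: Optional[str] = None) -> dict[str, int]:
        return dict(a=a, c=c)


# Init path: config kwargs {"a": 1, "c": 1} resolve to just a=1, because "c"
# is not an __init__ parameter; b keeps its default.
proc = DummyProcessor(a=1)
assert (proc.a, proc.b) == (1, 0)

# Call path: kwargs {"b": 1, "c": 1} resolve to just c=1, because "b" is not
# a __call__ parameter; a keeps its default.
assert proc(c=1) == {"a": 0, "c": 1}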

vllm/config.py

Lines changed: 11 additions & 1 deletion
@@ -11,6 +11,7 @@
 import uuid
 import warnings
 from collections import Counter
+from collections.abc import Mapping
 from contextlib import contextmanager
 from dataclasses import (MISSING, Field, asdict, field, fields, is_dataclass,
                          replace)
@@ -3332,7 +3333,16 @@ def get_limit_per_prompt(self, modality: str) -> int:
             999 if envs.VLLM_USE_V1 else 1,
         )
 
-    # TODO: Add configs to init vision tower or not.
+    def merge_mm_processor_kwargs(
+        self,
+        inference_kwargs: Mapping[str, object],
+    ) -> dict[str, object]:
+        """
+        Get the keyword arguments to pass to the multi-modal processor
+        according to the extra arguments passed during inference.
+        """
+        kwargs = self.mm_processor_kwargs or {}
+        return kwargs | dict(inference_kwargs)
 
 
 @config
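
The new helper centralizes how config-level mm_processor_kwargs and inference-time kwargs are combined: a plain dict union in which the inference-time values win. A minimal standalone sketch of that behavior (not the vLLM method itself), mirroring the parametrized cases in tests/multimodal/test_processing.py:

from collections.abc import Mapping
from typing import Optional


def merge_mm_processor_kwargs(
    config_kwargs: Optional[Mapping[str, object]],
    inference_kwargs: Mapping[str, object],
) -> dict[str, object]:
    # Config-level kwargs form the base; inference kwargs overlay them.
    base = dict(config_kwargs or {})
    return base | dict(inference_kwargs)


assert merge_mm_processor_kwargs({"a": 1}, {}) == {"a": 1}
assert merge_mm_processor_kwargs({"a": 1}, {"a": 2}) == {"a": 2}  # inference wins
assert merge_mm_processor_kwargs(None, {"b": 3}) == {"b": 3}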

vllm/inputs/registry.py

Lines changed: 8 additions & 9 deletions
@@ -11,7 +11,7 @@
 from vllm.jsontree import JSONTree, json_map_leaves
 from vllm.logger import init_logger
 from vllm.transformers_utils.processor import cached_processor_from_config
-from vllm.utils import resolve_mm_processor_kwargs
+from vllm.utils import get_allowed_kwarg_only_overrides
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig
@@ -154,14 +154,11 @@ def call_hf_processor(
         assert callable(hf_processor)
 
         mm_config = self.model_config.get_multimodal_config()
-        base_kwargs = mm_config.mm_processor_kwargs
-        if base_kwargs is None:
-            base_kwargs = {}
+        merged_kwargs = mm_config.merge_mm_processor_kwargs(kwargs)
 
-        merged_kwargs = resolve_mm_processor_kwargs(
-            base_kwargs,
-            kwargs,
+        allowed_kwargs = get_allowed_kwarg_only_overrides(
             hf_processor,
+            merged_kwargs,
             requires_kw_only=False,
             allow_var_kwargs=True,
         )
@@ -173,7 +170,9 @@ def maybe_cast_dtype(x):
             return x
 
         try:
-            output = hf_processor(**data, **merged_kwargs, return_tensors="pt")
+            output = hf_processor(**data,
+                                  **allowed_kwargs,
+                                  return_tensors="pt")
             # this emulates output.to(dtype=self.model_config.dtype)
             if isinstance(output, BatchFeature):
                 cast_output = json_map_leaves(maybe_cast_dtype, output.data)
@@ -189,7 +188,7 @@ def maybe_cast_dtype(x):
 
         except Exception as exc:
             msg = (f"Failed to apply {type(hf_processor).__name__} "
-                   f"on data={data} with kwargs={merged_kwargs}")
+                   f"on data={data} with kwargs={allowed_kwargs}")
 
             raise ValueError(msg) from exc
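
After merging, call_hf_processor filters the combined kwargs against what the HF processor actually accepts before invoking it, via get_allowed_kwarg_only_overrides from vllm.utils. That helper's body is not shown in this diff; the sketch below only illustrates the signature-based filtering idea under that assumption, and is not the vLLM implementation:

import inspect
from collections.abc import Mapping
from typing import Callable


def filter_kwargs_for(fn: Callable, kwargs: Mapping[str, object]) -> dict[str, object]:
    # Keep only the kwargs that the callable can accept by its signature.
    params = inspect.signature(fn).parameters
    accepts_var_kwargs = any(
        p.kind is inspect.Parameter.VAR_KEYWORD for p in params.values())
    if accepts_var_kwargs:
        return dict(kwargs)  # **kwargs would swallow everything
    return {k: v for k, v in kwargs.items() if k in params}


def dummy_processor(a: int = 0, c: int = 0, return_tensors=None):
    return {"a": a, "c": c}


# "b" is silently dropped because dummy_processor does not accept it.
assert filter_kwargs_for(dummy_processor, {"a": 1, "b": 2}) == {"a": 1}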

vllm/model_executor/models/aya_vision.py

Lines changed: 3 additions & 9 deletions
@@ -123,16 +123,10 @@ def get_hf_config(self) -> AyaVisionConfig:
         return self.ctx.get_hf_config(AyaVisionConfig)
 
     def get_hf_processor(self, **kwargs: object) -> AyaVisionProcessor:
-        processor = self.ctx.get_hf_processor(AyaVisionProcessor, **kwargs)
+        return self.ctx.get_hf_processor(AyaVisionProcessor, **kwargs)
 
-        # Temporary workaround since this processor has multiple image tokens
-        # See https://github.com/huggingface/transformers/issues/38350
-        processor._check_special_mm_tokens = lambda *args, **kwargs: None
-
-        return processor
-
-    def get_image_processor(self) -> GotOcr2ImageProcessor:
-        return self.get_hf_processor().image_processor
+    def get_image_processor(self, **kwargs: object) -> GotOcr2ImageProcessor:
+        return self.get_hf_processor(**kwargs).image_processor
 
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None}
