
Commit 82de9b9

[Misc] Automatically resolve HF processor init kwargs (#22005)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
1 parent ad57f23 · commit 82de9b9

40 files changed: +332 -725 lines

examples/offline_inference/vision_language.py

Lines changed: 19 additions & 19 deletions
@@ -449,25 +449,6 @@ def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
     )
 
 
-# omni-research/Tarsier-7b
-def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
-    assert modality == "image"
-    model_name = "omni-research/Tarsier-7b"
-
-    engine_args = EngineArgs(
-        model=model_name,
-        trust_remote_code=True,
-        max_model_len=4096,
-        limit_mm_per_prompt={modality: 1},
-    )
-    prompts = [(f"USER: <image>\n{question} ASSISTANT:") for question in questions]
-
-    return ModelRequestData(
-        engine_args=engine_args,
-        prompts=prompts,
-    )
-
-
 # Intern-S1
 def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
     model_name = "internlm/Intern-S1"
@@ -1293,6 +1274,25 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
     )
 
 
+# omni-research/Tarsier-7b
+def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+    model_name = "omni-research/Tarsier-7b"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=4096,
+        limit_mm_per_prompt={modality: 1},
+    )
+    prompts = [(f"USER: <image>\n{question} ASSISTANT:") for question in questions]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
     model_name = "omni-research/Tarsier2-Recap-7b"

tests/lora/test_qwen2vl.py

Lines changed: 0 additions & 6 deletions
@@ -4,8 +4,6 @@
 from typing import Optional
 
 import pytest
-from packaging.version import Version
-from transformers import __version__ as TRANSFORMERS_VERSION
 
 import vllm
 from vllm.assets.image import ImageAsset
@@ -185,10 +183,6 @@ def test_qwen2vl_lora_beam_search(qwen2vl_lora_files):
     current_platform.is_rocm(),
     reason="Qwen2.5-VL dependency xformers incompatible with ROCm",
 )
-@pytest.mark.skipif(
-    Version(TRANSFORMERS_VERSION) < Version("4.49.0"),
-    reason="Qwen2.5-VL require transformers version no lower than 4.49.0",
-)
 def test_qwen25vl_lora(qwen25vl_lora_files):
     """Test Qwen 2.5 VL model with LoRA"""
     config = TestConfig(model_path=QWEN25VL_MODEL_PATH,

tests/models/multimodal/generation/test_common.py

Lines changed: 26 additions & 1 deletion
@@ -702,13 +702,38 @@
     "smolvlm": VLMTestInfo(
         models=["HuggingFaceTB/SmolVLM2-2.2B-Instruct"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        prompt_formatter=lambda img_prompt:f"<|im_start|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
+        prompt_formatter=lambda img_prompt: f"<|im_start|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
         img_idx_to_prompt=lambda idx: "<image>",
         max_model_len=8192,
         max_num_seqs=2,
         auto_cls=AutoModelForImageTextToText,
         hf_output_post_proc=model_utils.smolvlm_trunc_hf_output,
     ),
+    "tarsier": VLMTestInfo(
+        models=["omni-research/Tarsier-7b"],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt: f"USER: {img_prompt} ASSISTANT:",
+        max_model_len=4096,
+        max_num_seqs=2,
+        auto_cls=AutoModelForImageTextToText,
+        patch_hf_runner=model_utils.tarsier_patch_hf_runner,
+    ),
+    "tarsier2": VLMTestInfo(
+        models=["omni-research/Tarsier2-Recap-7b"],
+        test_type=(
+            VLMTestType.IMAGE,
+            VLMTestType.MULTI_IMAGE,
+            VLMTestType.VIDEO,
+        ),
+        prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
+        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",  # noqa: E501
+        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>",  # noqa: E501
+        max_model_len=4096,
+        max_num_seqs=2,
+        auto_cls=AutoModelForImageTextToText,
+        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+        marks=[pytest.mark.skip("Model initialization hangs")],
+    ),
     ### Tensor parallel / multi-gpu broadcast tests
     "chameleon-broadcast": VLMTestInfo(
         models=["facebook/chameleon-7b"],

tests/models/multimodal/generation/vlm_utils/model_utils.py

Lines changed: 12 additions & 0 deletions
@@ -818,3 +818,15 @@ def qwen2_5_omni_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
     thinker.get_output_embeddings = lambda: thinker.lm_head
     hf_model.model = thinker
     return hf_model
+
+
+def tarsier_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+    from vllm.model_executor.models.tarsier import get_vision_encoder_info
+
+    vision_encoder_info = get_vision_encoder_info(hf_model.config)
+
+    hf_processor = hf_model.processor
+    if hf_processor.patch_size is None:
+        hf_processor.patch_size = vision_encoder_info.get_patch_size()
+
+    return hf_model
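
Note (an assumption about intent, not stated in the commit): if the checkpoint's processor config leaves patch_size unset, an HF LLaVA-style processor cannot work out how many image tokens an image expands to, so the test fills it in from vLLM's vision encoder info. A minimal standalone sketch of that fallback, with hypothetical stand-in classes and an assumed value of 14:

# Hypothetical stand-ins for illustration only; not the Tarsier checkpoint's
# actual values or vLLM's actual classes.
class _FakeVisionEncoderInfo:
    def get_patch_size(self) -> int:
        return 14  # assumed patch size


class _FakeProcessor:
    patch_size = None  # unset in the checkpoint's processor config


processor = _FakeProcessor()
if processor.patch_size is None:
    processor.patch_size = _FakeVisionEncoderInfo().get_patch_size()

assert processor.patch_size == 14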

tests/models/multimodal/processing/test_transformers.py

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ def test_multimodal_processor(model_id):
         model_impl="transformers",
     )
 
-    mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config, )
+    mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config)
 
     image_pil = ImageAsset('cherry_blossom').pil_image
     mm_data = {"image": image_pil}

tests/models/registry.py

Lines changed: 1 addition & 2 deletions
@@ -465,8 +465,7 @@ def check_available_online(
                                       is_available_online=False),
     "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b",  # noqa: E501
                                      trust_remote_code=True),
-    "TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b",  # noqa: E501
-                                                       hf_overrides={"architectures": ["TarsierForConditionalGeneration"]}),  # noqa: E501
+    "TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b"),  # noqa: E501
     "Tarsier2ForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier2-Recap-7b",  # noqa: E501
                                                         hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]}),  # noqa: E501
     "VoxtralForConditionalGeneration": _HfExamplesInfo(

tests/multimodal/test_processing.py

Lines changed: 70 additions & 37 deletions
@@ -2,16 +2,15 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from contextlib import nullcontext
-from types import MethodType
-from typing import cast
+from typing import Optional, cast
 from unittest.mock import MagicMock
 
 import numpy as np
 import pytest
 import torch
-from transformers import ProcessorMixin
 
 from vllm.config import ModelConfig
+from vllm.inputs import InputProcessingContext
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargs,
                                     MultiModalKwargsItem,
@@ -1013,57 +1012,91 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
     )
 
 
-class _ProcessorProxy:
+class DummyProcessor:
 
-    def __init__(self, processor: ProcessorMixin) -> None:
+    def __init__(self, a: int = 0, b: int = 0) -> None:
         super().__init__()
 
-        self.__processor = processor
-
-    def __getattr__(self, key: str):
-        return getattr(self.__processor, key)
+        self.a = a
+        self.b = b
 
     def __call__(
         self,
-        text=None,
-        images=None,
-        videos=None,
-        exists=None,
-        return_tensors=None,
-    ):
-        return dict(exists=exists)
+        a: int = 0,
+        c: int = 0,
+        return_tensors: Optional[str] = None,
+    ) -> dict[str, int]:
+        return dict(a=a, c=c)
 
 
-@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])  # Dummy
 # yapf: disable
+@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])  # Dummy
 @pytest.mark.parametrize(
-    ("call_kwargs", "expected_kwargs"),
+    ("config_kwargs", "inference_kwargs", "expected_kwargs"),
     [
-        # Should ignore invalid kwargs
-        ({"does_not_exist": 100}, {"exists": None}),
-        ({"exists": 1}, {"exists": 1}),
-        ({"does_not_exist": 100, "exists": 1}, {"exists": 1}),
+        ({"a": 1}, {}, {"a": 1, "b": 0}),
+        ({}, {"a": 1}, {"a": 1, "b": 0}),
+        # inference_kwargs should take precedence
+        ({"a": 1}, {"a": 2}, {"a": 2, "b": 0}),
+        # Should ignore extra kwargs
+        ({"a": 1, "c": 1}, {}, {"a": 1, "b": 0}),
+        ({"b": 1, "c": 1}, {}, {"a": 0, "b": 1}),
     ],
 )
 # yapf: enable
-def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
-    model_config = ModelConfig(model_id)
+def test_hf_processor_init_kwargs(
+    model_id,
+    config_kwargs,
+    inference_kwargs,
+    expected_kwargs,
+):
+    # Should not be used since there is nothing to convert to tokens
+    mock_tokenizer = cast(AnyTokenizer, object())
 
-    processor = MULTIMODAL_REGISTRY.create_processor(model_config)
-    orig_get_hf_processor = processor.info.get_hf_processor
+    ctx = InputProcessingContext(
+        model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs),
+        tokenizer=mock_tokenizer,
+    )
+
+    processor = ctx.get_hf_processor(
+        DummyProcessor,  # type: ignore[arg-type]
+        **inference_kwargs,
+    )
+
+    for k, v in expected_kwargs.items():
+        assert getattr(processor, k) == v
 
-    def get_hf_processor(self, **kwargs):
-        assert kwargs == call_kwargs
-        return _ProcessorProxy(orig_get_hf_processor())
 
-    processor.info.get_hf_processor = MethodType(get_hf_processor,
-                                                 processor.info)
+# yapf: disable
+@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])  # Dummy
+@pytest.mark.parametrize(
+    ("config_kwargs", "inference_kwargs", "expected_kwargs"),
+    [
+        ({"a": 1}, {}, {"a": 1, "c": 0}),
+        ({}, {"a": 1}, {"a": 1, "c": 0}),
+        # inference_kwargs should take precedence
+        ({"a": 1}, {"a": 2}, {"a": 2, "c": 0}),
+        # Should ignore extra kwargs
+        ({"a": 1, "c": 1}, {}, {"a": 1, "c": 1}),
+        ({"b": 1, "c": 1}, {}, {"a": 0, "c": 1}),
+    ],
+)
+# yapf: enable
+def test_hf_processor_call_kwargs(
+    model_id,
+    config_kwargs,
+    inference_kwargs,
+    expected_kwargs,
+):
+    # Should not be used since there is nothing to convert to tokens
+    mock_tokenizer = cast(AnyTokenizer, object())
 
-    out_kwargs = processor._call_hf_processor(
-        prompt="",
-        mm_data={},
-        mm_kwargs=call_kwargs,
-        tok_kwargs={},
+    ctx = InputProcessingContext(
+        model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs),
+        tokenizer=mock_tokenizer,
     )
 
-    assert out_kwargs == expected_kwargs
+    processor = ctx.get_hf_processor(DummyProcessor)  # type: ignore[arg-type]
+
+    result = ctx.call_hf_processor(processor, {}, inference_kwargs)
+    assert result == expected_kwargs
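
Taken together, the two new tests cover both resolution paths: test_hf_processor_init_kwargs checks kwargs resolved against DummyProcessor.__init__ when the processor is constructed, while test_hf_processor_call_kwargs checks kwargs resolved against __call__ when it is invoked; in both, inference-time kwargs override config-level ones and unknown names are dropped. A small standalone restatement of those expectations, reusing the DummyProcessor definition from the diff above:

from typing import Optional


class DummyProcessor:
    def __init__(self, a: int = 0, b: int = 0) -> None:
        self.a = a
        self.b = b

    def __call__(self, a: int = 0, c: int = 0,
                 return_tensors: Optional[str] = None) -> dict[str, int]:
        return dict(a=a, c=c)


# Init path: config kwargs {"a": 1, "c": 1} resolve to just a=1, because "c"
# is not an __init__ parameter; b keeps its default.
proc = DummyProcessor(a=1)
assert (proc.a, proc.b) == (1, 0)

# Call path: kwargs {"b": 1, "c": 1} resolve to just c=1, because "b" is not
# a __call__ parameter; a keeps its default.
assert proc(c=1) == {"a": 0, "c": 1}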

vllm/config.py

Lines changed: 11 additions & 1 deletion
@@ -11,6 +11,7 @@
 import uuid
 import warnings
 from collections import Counter
+from collections.abc import Mapping
 from contextlib import contextmanager
 from dataclasses import (MISSING, Field, asdict, field, fields, is_dataclass,
                          replace)
@@ -3332,7 +3333,16 @@ def get_limit_per_prompt(self, modality: str) -> int:
             999 if envs.VLLM_USE_V1 else 1,
         )
 
-    # TODO: Add configs to init vision tower or not.
+    def merge_mm_processor_kwargs(
+        self,
+        inference_kwargs: Mapping[str, object],
+    ) -> dict[str, object]:
+        """
+        Get the keyword arguments to pass to the multi-modal processor
+        according to the extra arguments passed during inference.
+        """
+        kwargs = self.mm_processor_kwargs or {}
+        return kwargs | dict(inference_kwargs)
 
 
 @config
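
The new helper centralizes how config-level mm_processor_kwargs and inference-time kwargs are combined: a plain dict union in which the inference-time values win. A minimal standalone sketch of that behavior (not the vLLM method itself), mirroring the parametrized cases in tests/multimodal/test_processing.py:

from collections.abc import Mapping
from typing import Optional


def merge_mm_processor_kwargs(
    config_kwargs: Optional[Mapping[str, object]],
    inference_kwargs: Mapping[str, object],
) -> dict[str, object]:
    # Config-level kwargs form the base; inference kwargs overlay them.
    base = dict(config_kwargs or {})
    return base | dict(inference_kwargs)


assert merge_mm_processor_kwargs({"a": 1}, {}) == {"a": 1}
assert merge_mm_processor_kwargs({"a": 1}, {"a": 2}) == {"a": 2}  # inference wins
assert merge_mm_processor_kwargs(None, {"b": 3}) == {"b": 3}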

vllm/inputs/registry.py

Lines changed: 8 additions & 9 deletions
@@ -11,7 +11,7 @@
 from vllm.jsontree import JSONTree, json_map_leaves
 from vllm.logger import init_logger
 from vllm.transformers_utils.processor import cached_processor_from_config
-from vllm.utils import resolve_mm_processor_kwargs
+from vllm.utils import get_allowed_kwarg_only_overrides
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig
@@ -154,14 +154,11 @@ def call_hf_processor(
         assert callable(hf_processor)
 
         mm_config = self.model_config.get_multimodal_config()
-        base_kwargs = mm_config.mm_processor_kwargs
-        if base_kwargs is None:
-            base_kwargs = {}
+        merged_kwargs = mm_config.merge_mm_processor_kwargs(kwargs)
 
-        merged_kwargs = resolve_mm_processor_kwargs(
-            base_kwargs,
-            kwargs,
+        allowed_kwargs = get_allowed_kwarg_only_overrides(
             hf_processor,
+            merged_kwargs,
             requires_kw_only=False,
             allow_var_kwargs=True,
         )
@@ -173,7 +170,9 @@ def maybe_cast_dtype(x):
             return x
 
         try:
-            output = hf_processor(**data, **merged_kwargs, return_tensors="pt")
+            output = hf_processor(**data,
+                                  **allowed_kwargs,
+                                  return_tensors="pt")
             # this emulates output.to(dtype=self.model_config.dtype)
             if isinstance(output, BatchFeature):
                 cast_output = json_map_leaves(maybe_cast_dtype, output.data)
@@ -189,7 +188,7 @@ def maybe_cast_dtype(x):
 
         except Exception as exc:
             msg = (f"Failed to apply {type(hf_processor).__name__} "
-                   f"on data={data} with kwargs={merged_kwargs}")
+                   f"on data={data} with kwargs={allowed_kwargs}")
 
             raise ValueError(msg) from exc
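
After merging, call_hf_processor filters the combined kwargs against what the HF processor actually accepts before invoking it, via get_allowed_kwarg_only_overrides from vllm.utils. That helper's body is not shown in this diff; the sketch below only illustrates the signature-based filtering idea under that assumption, and is not the vLLM implementation:

import inspect
from collections.abc import Mapping
from typing import Callable


def filter_kwargs_for(fn: Callable, kwargs: Mapping[str, object]) -> dict[str, object]:
    # Keep only the kwargs that the callable can accept by its signature.
    params = inspect.signature(fn).parameters
    accepts_var_kwargs = any(
        p.kind is inspect.Parameter.VAR_KEYWORD for p in params.values())
    if accepts_var_kwargs:
        return dict(kwargs)  # **kwargs would swallow everything
    return {k: v for k, v in kwargs.items() if k in params}


def dummy_processor(a: int = 0, c: int = 0, return_tensors=None):
    return {"a": a, "c": c}


# "b" is silently dropped because dummy_processor does not accept it.
assert filter_kwargs_for(dummy_processor, {"a": 1, "b": 2}) == {"a": 1}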

vllm/model_executor/models/aya_vision.py

Lines changed: 3 additions & 9 deletions
@@ -123,16 +123,10 @@ def get_hf_config(self) -> AyaVisionConfig:
         return self.ctx.get_hf_config(AyaVisionConfig)
 
     def get_hf_processor(self, **kwargs: object) -> AyaVisionProcessor:
-        processor = self.ctx.get_hf_processor(AyaVisionProcessor, **kwargs)
+        return self.ctx.get_hf_processor(AyaVisionProcessor, **kwargs)
 
-        # Temporary workaround since this processor has multiple image tokens
-        # See https://github.com/huggingface/transformers/issues/38350
-        processor._check_special_mm_tokens = lambda *args, **kwargs: None
-
-        return processor
-
-    def get_image_processor(self) -> GotOcr2ImageProcessor:
-        return self.get_hf_processor().image_processor
+    def get_image_processor(self, **kwargs: object) -> GotOcr2ImageProcessor:
+        return self.get_hf_processor(**kwargs).image_processor
 
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None}
