diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index b2821966cf123..5dc6a936d1c1b 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -85,6 +85,7 @@ def load_h2ovl(question: str, image_urls: List[str]) -> ModelRequestData: trust_remote_code=True, max_model_len=8192, limit_mm_per_prompt={"image": len(image_urls)}, + mm_processor_kwargs={"max_dynamic_patch": 4}, ) placeholders = "\n".join(f"Image-{i}: \n" diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 88dcc32f44f52..331ffe82ec85d 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -10,7 +10,7 @@ from vllm.inputs import InputProcessingContext from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.processing import ProcessingCache -from vllm.multimodal.utils import cached_get_tokenizer +from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config from ....multimodal.utils import random_audio, random_image, random_video from ...registry import HF_EXAMPLE_MODELS @@ -42,10 +42,7 @@ def _test_processing_correctness( factories = MULTIMODAL_REGISTRY._processor_factories[model_cls] ctx = InputProcessingContext( model_config, - tokenizer=cached_get_tokenizer( - model_config.tokenizer, - trust_remote_code=model_info.trust_remote_code, - ), + tokenizer=cached_tokenizer_from_config(model_config), ) # Ensure that it can fit all of the data cache = ProcessingCache(capacity=1 << 30) diff --git a/tests/models/multimodal/processing/test_h2ovl.py b/tests/models/multimodal/processing/test_h2ovl.py index 767ac5eb9ef9a..5c43e4eed7878 100644 --- a/tests/models/multimodal/processing/test_h2ovl.py +++ b/tests/models/multimodal/processing/test_h2ovl.py @@ -1,17 +1,118 @@ # SPDX-License-Identifier: Apache-2.0 """Tests for H2OVL's multimodal preprocessing kwargs.""" -from typing import Optional +from typing import Mapping, Optional import pytest +from PIL import Image +from transformers import PretrainedConfig from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import rescale_image_size -from vllm.multimodal.utils import cached_get_tokenizer +from vllm.multimodal.processing import BaseMultiModalProcessor +from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config from ....conftest import _ImageAssets from ...utils import build_model_context +def _get_expected_num_patches( + config: PretrainedConfig, + image: Image.Image, + num_imgs: int, + min_num: int, + max_num: int, +): + from vllm.model_executor.models.h2ovl import (calculate_h2ovl_targets, + get_h2ovl_target_ratios) + + width, height = image.size + + # Calculate the expected number of blocks + if num_imgs == 1 and config.use_msac: + # First pass + blocks1, _, _, aspect_ratio = calculate_h2ovl_targets( + orig_width=width, + orig_height=height, + target_ratios=get_h2ovl_target_ratios( + min_num=1, + max_num=max_num, + prior_aspect_ratio=None, + ), + image_size=config.vision_config.image_size, + use_thumbnail=False, # Thumbnail is handled separately + ) + + # Second pass + blocks2, _, _, _ = calculate_h2ovl_targets( + orig_width=width, + orig_height=height, + target_ratios=get_h2ovl_target_ratios( + min_num=3, + max_num=max_num, + prior_aspect_ratio=aspect_ratio, + ), + image_size=config.vision_config.image_size, + use_thumbnail=False, + ) + + 
# Add thumbnail if use_thumbnail is True and total_blocks > 1 + if config.use_thumbnail: + blocks1 += 1 if blocks1 > 1 else 0 + blocks2 += 1 if blocks2 > 1 else 0 + + # Total blocks is the sum of blocks from both passes minus + # overlapping + total_blocks = blocks1 + blocks2 - 1 + + return total_blocks + + blocks, _, _, _ = calculate_h2ovl_targets( + orig_width=width, + orig_height=height, + target_ratios=get_h2ovl_target_ratios( + min_num, + max_num, + prior_aspect_ratio=None, + ), + image_size=config.vision_config.image_size, + use_thumbnail=False, + ) + expected_num_patches = blocks + + if config.use_thumbnail and expected_num_patches > 1: + expected_num_patches += 1 + + return expected_num_patches + + +def _run_check( + processor: BaseMultiModalProcessor, + images: list[Image.Image], + min_num: int, + max_num: int, + mm_processor_kwargs: Mapping[str, object], +): + tokenizer = processor.info.get_tokenizer() + config = processor.info.get_hf_config() + + mm_data = {"image": images} + + total_expected_num_patches = sum( + _get_expected_num_patches(config, image, len(images), min_num, max_num) + for image in images) + + processed_inputs = processor.apply("" * len(images), mm_data, + mm_processor_kwargs) + + # Ensure we have the right number of placeholders per num_crops size + image_token_id = tokenizer.convert_tokens_to_ids("") + img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id) + pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape + + assert img_tok_count == 256 * total_expected_num_patches + assert pixel_shape[0] == total_expected_num_patches + + @pytest.mark.parametrize("model_id", [ "h2oai/h2ovl-mississippi-800m", "h2oai/h2ovl-mississippi-2b", @@ -25,118 +126,54 @@ [1.0, 1.0, 1.0], # Multi-scale [0.25, 0.5, 1.0], + [4.0, 2.0, 1.0], ], ) -@pytest.mark.parametrize("max_dynamic_patch", [1, 2, 4, 8]) +@pytest.mark.parametrize( + ("min_dynamic_patch", "max_dynamic_patch"), + [(1, 1), (1, 2), (1, 4), (1, 8), (2, 4), (4, 8)], +) @pytest.mark.parametrize("dynamic_image_size", [True, False]) -@pytest.mark.parametrize("num_imgs", [1, 2]) +@pytest.mark.parametrize("kwargs_on_init", [True, False]) def test_processor_override( model_id: str, image_assets: _ImageAssets, size_factors: list[int], + min_dynamic_patch: int, max_dynamic_patch: int, dynamic_image_size: Optional[bool], - num_imgs: int, + kwargs_on_init: bool, ): - from vllm.model_executor.models.h2ovl import (calculate_h2ovl_targets, - get_h2ovl_target_ratios) + mm_processor_kwargs = { + "min_dynamic_patch": min_dynamic_patch, + "max_dynamic_patch": max_dynamic_patch, + "dynamic_image_size": dynamic_image_size, + } ctx = build_model_context( model_name=model_id, tokenizer_name=model_id, trust_remote_code=True, - mm_processor_kwargs=None, - limit_mm_per_prompt={"image": num_imgs}, - ) - tokenizer = cached_get_tokenizer( - ctx.model_config.tokenizer, - trust_remote_code=ctx.model_config.trust_remote_code, + mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, + limit_mm_per_prompt={"image": len(size_factors)}, ) + tokenizer = cached_tokenizer_from_config(ctx.model_config) processor = MULTIMODAL_REGISTRY.create_processor( ctx.model_config, tokenizer=tokenizer, ) + hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs - config = processor.info.get_hf_config() - use_msac = config.use_msac - - mm_processor_kwargs = { - "max_dynamic_patch": max_dynamic_patch, - } - if dynamic_image_size is not None: - mm_processor_kwargs["dynamic_image_size"] = dynamic_image_size - - 
min_num = config.min_dynamic_patch + min_num = min_dynamic_patch if dynamic_image_size else 1 max_num = max_dynamic_patch if dynamic_image_size else 1 - # Build the image str / prompt based on the number of images we pass - prompt = "" * num_imgs - - for asset in image_assets: - for factor in size_factors: - image = rescale_image_size(asset.pil_image, factor) - mm_data = {"image": [image] * num_imgs} - - width, height = image.size - - # Calculate the expected number of blocks - if num_imgs == 1 and use_msac: - # First pass - blocks1, _, _, aspect_ratio = calculate_h2ovl_targets( - orig_width=width, - orig_height=height, - target_ratios=get_h2ovl_target_ratios( - min_num, - max_num, - prior_aspect_ratio=None, - ), - image_size=config.vision_config.image_size, - use_thumbnail=False, # Thumbnail is handled separately - ) - - # Second pass - blocks2, _, _, _ = calculate_h2ovl_targets( - orig_width=width, - orig_height=height, - target_ratios=get_h2ovl_target_ratios( - min_num, - max_num, - prior_aspect_ratio=aspect_ratio, - ), - image_size=config.vision_config.image_size, - use_thumbnail=False, - ) - - # Add thumbnail if use_thumbnail is True and total_blocks > 1 - if config.use_thumbnail: - blocks1 += 1 if blocks1 > 1 else 0 - blocks2 += 1 if blocks2 > 1 else 0 - - # Total blocks is the sum of blocks from both passes minus - # overlapping - total_blocks = blocks1 + blocks2 - 1 - - expected_num_patches = total_blocks - else: - blocks, _, _, _ = calculate_h2ovl_targets( - orig_width=width, - orig_height=height, - target_ratios=get_h2ovl_target_ratios( - min_num, - max_num, - prior_aspect_ratio=None, - ), - image_size=config.vision_config.image_size, - use_thumbnail=False, - ) - expected_num_patches = blocks - - if config.use_thumbnail and expected_num_patches != 1: - expected_num_patches += 1 - - processed_inputs = processor.apply(prompt, mm_data, - mm_processor_kwargs) - pixel_shape = ( - processed_inputs["mm_kwargs"]["pixel_values_flat"].shape) - - assert pixel_shape[0] == expected_num_patches * num_imgs + _run_check( + processor, + [ + rescale_image_size(image_assets[0].pil_image, f) + for f in size_factors + ], + min_num, + max_num, + hf_processor_mm_kwargs, + ) diff --git a/tests/models/multimodal/processing/test_idefics3.py b/tests/models/multimodal/processing/test_idefics3.py index 07ab1bbd4b5ea..0a0f1cb389380 100644 --- a/tests/models/multimodal/processing/test_idefics3.py +++ b/tests/models/multimodal/processing/test_idefics3.py @@ -4,7 +4,7 @@ from transformers import Idefics3Config from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.utils import cached_get_tokenizer +from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config from ....conftest import _ImageAssets from ...utils import build_model_context @@ -22,9 +22,15 @@ ]) # yapf: enable @pytest.mark.parametrize("num_imgs", [1, 2]) -def test_processor_override(image_assets: _ImageAssets, model: str, - mm_processor_kwargs: dict[str, object], - expected_toks_per_img: int, num_imgs: int): +@pytest.mark.parametrize("kwargs_on_init", [True, False]) +def test_processor_override( + image_assets: _ImageAssets, + model: str, + mm_processor_kwargs: dict[str, object], + expected_toks_per_img: int, + num_imgs: int, + kwargs_on_init: bool, +): """Ensure input_processor_for_idefics3 handles num_crops properly.""" # Same as the previous test - don't initialize mm_processor_kwargs # in this test and assume that the kwargs will be correctly expanded by @@ -33,15 +39,15 @@ def test_processor_override(image_assets: 
_ImageAssets, model: str, model_name=model, tokenizer_name=model, trust_remote_code=True, - mm_processor_kwargs=None, + mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": num_imgs}, ) - tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer) + tokenizer = cached_tokenizer_from_config(ctx.model_config) processor = MULTIMODAL_REGISTRY.create_processor( ctx.model_config, tokenizer=tokenizer, ) - hf_processor = processor.info.get_hf_processor(**mm_processor_kwargs) + hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs # Build the image str / prompt based on the number of images we pass placeholders = "" if num_imgs == 1 else "\n".join( @@ -54,8 +60,10 @@ def test_processor_override(image_assets: _ImageAssets, model: str, dummy_image = image_assets[0].pil_image.resize(dummy_image_size) mm_data = {"image": [dummy_image] * num_imgs} - processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) + processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs) + # Ensure the placeholders format are correct + hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs) hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"]) assert processed_inputs["prompt_token_ids"] == hf_processed_inputs[ "input_ids"][0] diff --git a/tests/models/multimodal/processing/test_internvl.py b/tests/models/multimodal/processing/test_internvl.py index ede961225be7b..cc777fdf57b3c 100644 --- a/tests/models/multimodal/processing/test_internvl.py +++ b/tests/models/multimodal/processing/test_internvl.py @@ -1,64 +1,136 @@ # SPDX-License-Identifier: Apache-2.0 """Tests for InternVL's multimodal preprocessing kwargs.""" -from typing import Optional +from typing import Mapping, Optional import pytest +from PIL import Image +from transformers import PretrainedConfig from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.utils import cached_get_tokenizer +from vllm.multimodal.image import rescale_image_size +from vllm.multimodal.processing import BaseMultiModalProcessor +from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config from ....conftest import _ImageAssets from ...utils import build_model_context +def _get_expected_num_patches( + config: PretrainedConfig, + image: Image.Image, + num_imgs: int, + min_num: int, + max_num: int, +): + from vllm.model_executor.models.internvl import ( + calculate_internvl_targets, get_internvl_target_ratios) + + width, height = image.size + + blocks, _, _ = calculate_internvl_targets( + orig_width=width, + orig_height=height, + target_ratios=get_internvl_target_ratios( + min_num, + max_num, + ), + image_size=config.vision_config.image_size, + use_thumbnail=False, + ) + expected_num_patches = blocks + + if config.use_thumbnail and expected_num_patches > 1: + expected_num_patches += 1 + + return expected_num_patches + + +def _run_check( + processor: BaseMultiModalProcessor, + images: list[Image.Image], + min_num: int, + max_num: int, + mm_processor_kwargs: Mapping[str, object], +): + tokenizer = processor.info.get_tokenizer() + config = processor.info.get_hf_config() + + mm_data = {"image": images} + + total_expected_num_patches = sum( + _get_expected_num_patches(config, image, len(images), min_num, max_num) + for image in images) + + processed_inputs = processor.apply("" * len(images), mm_data, + mm_processor_kwargs) + + # Ensure we have the right number of placeholders per num_crops size + image_token_id = 
tokenizer.convert_tokens_to_ids("") + img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id) + pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape + + assert img_tok_count == 256 * total_expected_num_patches + assert pixel_shape[0] == total_expected_num_patches + + @pytest.mark.parametrize("model_id", ["OpenGVLab/InternVL2-2B"]) -@pytest.mark.parametrize("max_dynamic_patch", [1, 4]) -@pytest.mark.parametrize("dynamic_image_size", [True, False, None]) -@pytest.mark.parametrize("num_imgs", [1, 2]) +@pytest.mark.parametrize( + "size_factors", + [ + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + # Multi-scale + [0.25, 0.5, 1.0], + [4.0, 2.0, 1.0], + ], +) +@pytest.mark.parametrize( + ("min_dynamic_patch", "max_dynamic_patch"), + [(1, 1), (1, 2), (1, 4), (1, 8), (2, 4), (4, 8)], +) +@pytest.mark.parametrize("dynamic_image_size", [True, False]) +@pytest.mark.parametrize("kwargs_on_init", [True, False]) def test_processor_override( model_id: str, image_assets: _ImageAssets, + size_factors: list[int], + min_dynamic_patch: int, max_dynamic_patch: int, dynamic_image_size: Optional[bool], - num_imgs: int, + kwargs_on_init: bool, ): + mm_processor_kwargs = { + "min_dynamic_patch": min_dynamic_patch, + "max_dynamic_patch": max_dynamic_patch, + "dynamic_image_size": dynamic_image_size, + } + ctx = build_model_context( model_name=model_id, tokenizer_name=model_id, trust_remote_code=True, - mm_processor_kwargs=None, - limit_mm_per_prompt={"image": num_imgs}, - ) - tokenizer = cached_get_tokenizer( - ctx.model_config.tokenizer, - trust_remote_code=ctx.model_config.trust_remote_code, + mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, + limit_mm_per_prompt={"image": len(size_factors)}, ) + tokenizer = cached_tokenizer_from_config(ctx.model_config) processor = MULTIMODAL_REGISTRY.create_processor( ctx.model_config, tokenizer=tokenizer, ) + hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs - mm_processor_kwargs = { - "max_dynamic_patch": max_dynamic_patch, - } - if dynamic_image_size is not None: - mm_processor_kwargs["dynamic_image_size"] = dynamic_image_size + min_num = min_dynamic_patch if dynamic_image_size else 1 + max_num = max_dynamic_patch if dynamic_image_size else 1 - # Build the image str / prompt based on the number of images we pass - prompt = "" * num_imgs - image = image_assets[0].pil_image.resize((448 * 2, 448 * 2)) - mm_data = {"image": [image] * num_imgs} - - expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1 - if dynamic_image_size is False: - expected_num_patches = 1 - - processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) - - # Ensure we have the right number of placeholders per num_crops size - image_token_id = tokenizer.convert_tokens_to_ids("") - img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id) - pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape - - assert img_tok_count == 256 * expected_num_patches * num_imgs - assert pixel_shape[0] == expected_num_patches * num_imgs + _run_check( + processor, + [ + rescale_image_size(image_assets[0].pil_image, f) + for f in size_factors + ], + min_num, + max_num, + hf_processor_mm_kwargs, + ) diff --git a/tests/models/multimodal/processing/test_llava_next.py b/tests/models/multimodal/processing/test_llava_next.py index fe4754c2ef6f6..dca25e5d4c4c6 100644 --- a/tests/models/multimodal/processing/test_llava_next.py +++ 
b/tests/models/multimodal/processing/test_llava_next.py @@ -10,7 +10,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.parse import ImageSize from vllm.multimodal.processing import BaseMultiModalProcessor -from vllm.multimodal.utils import cached_get_tokenizer +from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config from ...utils import build_model_context @@ -43,10 +43,7 @@ def test_processor_max_tokens(model_id): ) processor = MULTIMODAL_REGISTRY.create_processor( ctx.model_config, - tokenizer=cached_get_tokenizer( - ctx.model_config.tokenizer, - trust_remote_code=ctx.model_config.trust_remote_code, - ), + tokenizer=cached_tokenizer_from_config(ctx.model_config), ) info = processor.info @@ -146,10 +143,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs): ) processor = MULTIMODAL_REGISTRY.create_processor( ctx.model_config, - tokenizer=cached_get_tokenizer( - ctx.model_config.tokenizer, - trust_remote_code=ctx.model_config.trust_remote_code, - ), + tokenizer=cached_tokenizer_from_config(ctx.model_config), ) image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328), @@ -179,10 +173,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs): ) processor = MULTIMODAL_REGISTRY.create_processor( ctx.model_config, - tokenizer=cached_get_tokenizer( - ctx.model_config.tokenizer, - trust_remote_code=ctx.model_config.trust_remote_code, - ), + tokenizer=cached_tokenizer_from_config(ctx.model_config), ) seen_aspect_ratios = set[float]() diff --git a/tests/models/multimodal/processing/test_llava_onevision.py b/tests/models/multimodal/processing/test_llava_onevision.py index fb650d9e0995f..96abc840f0521 100644 --- a/tests/models/multimodal/processing/test_llava_onevision.py +++ b/tests/models/multimodal/processing/test_llava_onevision.py @@ -10,7 +10,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.parse import ImageSize from vllm.multimodal.processing import BaseMultiModalProcessor -from vllm.multimodal.utils import cached_get_tokenizer +from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config from ...utils import build_model_context @@ -44,10 +44,7 @@ def test_processor_max_tokens(model_id): ) processor = MULTIMODAL_REGISTRY.create_processor( ctx.model_config, - tokenizer=cached_get_tokenizer( - ctx.model_config.tokenizer, - trust_remote_code=ctx.model_config.trust_remote_code, - ), + tokenizer=cached_tokenizer_from_config(ctx.model_config), ) info = processor.info @@ -146,10 +143,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs): ) processor = MULTIMODAL_REGISTRY.create_processor( ctx.model_config, - tokenizer=cached_get_tokenizer( - ctx.model_config.tokenizer, - trust_remote_code=ctx.model_config.trust_remote_code, - ), + tokenizer=cached_tokenizer_from_config(ctx.model_config), ) image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328), @@ -180,10 +174,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs): ) processor = MULTIMODAL_REGISTRY.create_processor( ctx.model_config, - tokenizer=cached_get_tokenizer( - ctx.model_config.tokenizer, - trust_remote_code=ctx.model_config.trust_remote_code, - ), + tokenizer=cached_tokenizer_from_config(ctx.model_config), ) seen_aspect_ratios = set[float]() diff --git a/tests/models/multimodal/processing/test_phi3v.py b/tests/models/multimodal/processing/test_phi3v.py index dde8904f2ef65..420644f70842f 100644 --- a/tests/models/multimodal/processing/test_phi3v.py +++ 
b/tests/models/multimodal/processing/test_phi3v.py @@ -3,7 +3,7 @@ import pytest from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.utils import cached_get_tokenizer +from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config from ....conftest import _ImageAssets from ...utils import build_model_context @@ -21,12 +21,14 @@ ]) # yapf: enable @pytest.mark.parametrize("num_imgs", [1, 2]) +@pytest.mark.parametrize("kwargs_on_init", [True, False]) def test_processor_override( image_assets: _ImageAssets, model_id: str, mm_processor_kwargs: dict[str, int], expected_toks_per_img: int, num_imgs: int, + kwargs_on_init: bool, ): """Ensure input_processor_for_phi3v handles num_crops properly.""" # Avoid initializing CUDA early @@ -36,23 +38,22 @@ def test_processor_override( model_name=model_id, tokenizer_name=model_id, trust_remote_code=True, + mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": num_imgs}, ) - tokenizer = cached_get_tokenizer( - ctx.model_config.tokenizer, - trust_remote_code=ctx.model_config.trust_remote_code, - ) + tokenizer = cached_tokenizer_from_config(ctx.model_config) processor = MULTIMODAL_REGISTRY.create_processor( ctx.model_config, tokenizer=tokenizer, ) + hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs # Build the image str / prompt based on the number of images we pass img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)]) prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n" mm_data = {"image": [image_assets[0].pil_image] * num_imgs} - processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) + processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs) # Ensure we have the right number of placeholders per num_crops size img_tok_count = processed_inputs["prompt_token_ids"].count(_IMAGE_TOKEN_ID) diff --git a/tests/models/multimodal/processing/test_qwen2_vl.py b/tests/models/multimodal/processing/test_qwen2_vl.py index ef8e97f82d0bc..b882528aafb9c 100644 --- a/tests/models/multimodal/processing/test_qwen2_vl.py +++ b/tests/models/multimodal/processing/test_qwen2_vl.py @@ -3,7 +3,7 @@ import pytest from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.utils import cached_get_tokenizer +from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config from ....conftest import _ImageAssets from ...utils import build_model_context @@ -18,6 +18,7 @@ ]) # yapf: enable @pytest.mark.parametrize("num_imgs", [1, 2]) +@pytest.mark.parametrize("kwargs_on_init", [True, False]) def test_processor_override( image_assets: _ImageAssets, model_id: str, @@ -25,31 +26,30 @@ def test_processor_override( expected_toks_per_img: int, expected_pixels_shape: tuple[int, int], num_imgs: int, + kwargs_on_init: bool, ): """Ensure Qwen2VLMultiModalProcessor handles min/max pixels properly.""" ctx = build_model_context( model_name=model_id, tokenizer_name=model_id, - mm_processor_kwargs=None, + mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": num_imgs}, ) - tokenizer = cached_get_tokenizer( - ctx.model_config.tokenizer, - trust_remote_code=ctx.model_config.trust_remote_code, - ) + tokenizer = cached_tokenizer_from_config(ctx.model_config) processor = MULTIMODAL_REGISTRY.create_processor( ctx.model_config, tokenizer=tokenizer, ) + hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs # Build the image str / prompt based on the number of images we pass prompt 
= "<|vision_start|><|image_pad|><|vision_end|>" * num_imgs mm_data = {"image": [image_assets[0].pil_image] * num_imgs} - processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) + processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs) # Ensure we have the right number of placeholders per num_crops size - hf_processor = processor.info.get_hf_processor(**mm_processor_kwargs) + hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs) image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token) img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id) pixel_shape = processed_inputs["mm_kwargs"]["pixel_values"].shape diff --git a/tests/models/utils.py b/tests/models/utils.py index e2be43c126671..a90efb1767220 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -248,13 +248,16 @@ def check_logprobs_close( warnings.warn(fail_msg, stacklevel=2) -def build_model_context(model_name: str, - task: TaskOption = "auto", - tokenizer_name: Optional[str] = None, - trust_remote_code: bool = False, - dtype: Optional[Union[str, torch.dtype]] = None, - mm_processor_kwargs: Optional[Dict] = None, - limit_mm_per_prompt: Optional[Dict] = None): +def build_model_context( + model_name: str, + task: TaskOption = "auto", + tokenizer_name: Optional[str] = None, + trust_remote_code: bool = False, + dtype: Optional[Union[str, torch.dtype]] = None, + mm_processor_kwargs: Optional[Dict] = None, + limit_mm_per_prompt: Optional[Dict] = None, + disable_mm_preprocessor_cache: bool = True, +): """Creates an InputContext for a given model. Args: @@ -283,5 +286,6 @@ def build_model_context(model_name: str, seed=0, mm_processor_kwargs=mm_processor_kwargs, limit_mm_per_prompt=limit_mm_per_prompt, + disable_mm_preprocessor_cache=disable_mm_preprocessor_cache, ) return InputContext(model_config) diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index 7bbe5c53562df..b247321ebb2fd 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -22,8 +22,8 @@ replace_token_matches) # yapf: enable from vllm.multimodal.profiling import MultiModalProfiler -from vllm.multimodal.utils import cached_get_tokenizer -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.transformers_utils.tokenizer import (AnyTokenizer, + cached_tokenizer_from_config) from vllm.utils import full_groupby from .utils import random_image @@ -577,7 +577,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid): processor = MULTIMODAL_REGISTRY.create_processor( model_config, - tokenizer=cached_get_tokenizer(model_config.tokenizer), + tokenizer=cached_tokenizer_from_config(model_config), ) profiler = MultiModalProfiler(processor) @@ -617,7 +617,7 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid): processor = MULTIMODAL_REGISTRY.create_processor( model_config, - tokenizer=cached_get_tokenizer(model_config.tokenizer), + tokenizer=cached_tokenizer_from_config(model_config), ) rng = np.random.RandomState(0) @@ -689,7 +689,7 @@ def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs): processor = MULTIMODAL_REGISTRY.create_processor( model_config, - tokenizer=cached_get_tokenizer(model_config.tokenizer), + tokenizer=cached_tokenizer_from_config(model_config), ) orig_get_hf_processor = processor.info.get_hf_processor diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 87b7a7631e42e..691fcd7dc53f2 100644 --- 
a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -11,8 +11,9 @@ from typing_extensions import TypeVar, assert_never from vllm.logger import init_logger -from vllm.transformers_utils.processor import cached_get_processor -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.transformers_utils.processor import cached_processor_from_config +from vllm.transformers_utils.tokenizer import (AnyTokenizer, + cached_tokenizer_from_config) from vllm.utils import (ClassRegistry, get_allowed_kwarg_only_overrides, resolve_mm_processor_kwargs) @@ -27,19 +28,9 @@ logger = init_logger(__name__) -C = TypeVar("C", bound=PretrainedConfig, default=PretrainedConfig) -P = TypeVar("P", bound=ProcessorMixin, default=ProcessorMixin) - - -class HashableDict(dict): - """ - A dictionary that can be hashed by lru_cache. - """ - - # NOTE: pythonic dict is not hashable, - # we override on it directly for simplicity - def __hash__(self) -> int: # type: ignore[override] - return hash(frozenset(self.items())) +_T = TypeVar("_T") +_C = TypeVar("_C", bound=PretrainedConfig, default=PretrainedConfig) +_P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin) @dataclass(frozen=True) @@ -54,9 +45,9 @@ class InputContext: def get_hf_config( self, - typ: Union[type[C], tuple[type[C], ...]] = PretrainedConfig, + typ: Union[type[_C], tuple[type[_C], ...]] = PretrainedConfig, /, - ) -> C: + ) -> _C: """ Get the HuggingFace configuration (:class:`transformers.PretrainedConfig`) of the model, @@ -94,10 +85,10 @@ def get_mm_config(self): def get_hf_processor( self, - typ: Union[type[P], tuple[type[P], ...]] = ProcessorMixin, + typ: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin, /, **kwargs: object, - ) -> P: + ) -> _P: """ Get the HuggingFace processor (:class:`transformers.ProcessorMixin`) of the model, @@ -106,33 +97,29 @@ def get_hf_processor( Raises: TypeError: If the processor is not of the specified type. """ + return cached_processor_from_config( + self.model_config, + processor_cls=typ, + **kwargs, + ) + + def init_processor( + self, + typ: type[_T], + /, + **kwargs: object, + ) -> _T: + """ + Initialize a HuggingFace-like processor class, merging the + keyword arguments with those in the model's configuration. + """ base_kwargs = self.model_config.mm_processor_kwargs if base_kwargs is None: base_kwargs = {} merged_kwargs = {**base_kwargs, **kwargs} - if isinstance(typ, type): - merged_kwargs["processor_cls"] = typ - - # NOTE: Pythonic dict is not hashable and will raise unhashable type - # error when calling `cached_get_processor`, therefore we need to - # wrap it to a hashable dict. - for key, value in merged_kwargs.items(): - if isinstance(value, dict): - merged_kwargs[key] = HashableDict(value) - - hf_processor = cached_get_processor( - self.model_config.model, - trust_remote_code=self.model_config.trust_remote_code, - **merged_kwargs, - ) - if not isinstance(hf_processor, typ): - raise TypeError("Invalid type of HuggingFace processor. 
" - f"Expected type: {typ}, but " - f"found type: {type(hf_processor)}") - - return hf_processor + return typ(**merged_kwargs) @dataclass(frozen=True) @@ -142,10 +129,10 @@ class InputProcessingContext(InputContext): def get_hf_processor( self, - typ: Union[type[P], tuple[type[P], ...]] = ProcessorMixin, + typ: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin, /, **kwargs: object, - ) -> P: + ) -> _P: return super().get_hf_processor( typ, tokenizer=self.tokenizer, @@ -341,13 +328,9 @@ def dummy_data_for_profiling( from vllm.model_executor.model_loader import get_model_architecture from vllm.multimodal import MultiModalKwargs from vllm.multimodal.profiling import MultiModalProfiler - from vllm.multimodal.utils import cached_get_tokenizer if mm_registry.has_processor(model_config): - tokenizer = cached_get_tokenizer( - model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code, - ) + tokenizer = cached_tokenizer_from_config(model_config) processor = mm_registry.create_processor(model_config, tokenizer) profiler = MultiModalProfiler(processor) dummy_data = profiler.get_dummy_data( diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index df73a3b76b1fc..bff4100a1deef 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -400,8 +400,8 @@ def get_hf_config(self): def get_vision_config(self): return self.get_hf_config().vision_config - def get_hf_processor(self): - return self.ctx.get_hf_processor(AriaProcessor) + def get_hf_processor(self, **kwargs: object): + return self.ctx.get_hf_processor(AriaProcessor, **kwargs) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index b29dd65a8e357..2d4dfab60730f 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -58,8 +58,8 @@ class ChameleonProcessingInfo(BaseProcessingInfo): def get_hf_config(self): return self.ctx.get_hf_config(ChameleonConfig) - def get_hf_processor(self): - return self.ctx.get_hf_processor(ChameleonProcessor) + def get_hf_processor(self, **kwargs: object): + return self.ctx.get_hf_processor(ChameleonProcessor, **kwargs) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": 1} diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 0eaf3a6201f6b..5f684fa295ad5 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -28,13 +28,13 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement) from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs -from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.deepseek_vl2 import (DeepseekVLV2Config, MlpProjectorConfig, VisionEncoderConfig) from vllm.transformers_utils.processors.deepseek_vl2 import ( DeepseekVLV2Processor) +from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config from vllm.utils import is_list_of from .interfaces import SupportsMultiModal, SupportsPP @@ -133,8 +133,8 @@ class DeepseekVL2ProcessingInfo(BaseProcessingInfo): def get_hf_config(self): return self.ctx.get_hf_config(DeepseekVLV2Config) - def get_hf_processor(self) -> DeepseekVLV2Processor: - return self.ctx.get_hf_processor(DeepseekVLV2Processor) + def 
get_hf_processor(self, **kwargs: object): + return self.ctx.get_hf_processor(DeepseekVLV2Processor, **kwargs) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} @@ -308,13 +308,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.text_config = config.text_config model_config = vllm_config.model_config - tokenizer = cached_get_tokenizer( - model_config.tokenizer, - tokenizer_mode=model_config.tokenizer_mode, - tokenizer_revision=model_config.tokenizer_revision, - trust_remote_code=model_config.trust_remote_code, - ) - self.image_token_id = tokenizer.vocab.get(_IMAGE_TOKEN) + tokenizer = cached_tokenizer_from_config(model_config) + self.image_token_id = tokenizer.vocab[_IMAGE_TOKEN] self.vision = self._init_vision_module(self.vision_config, quant_config, diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 4e0ee6364f861..42a6aa9794271 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -71,8 +71,8 @@ class FuyuProcessingInfo(BaseProcessingInfo): def get_hf_config(self): return self.ctx.get_hf_config(FuyuConfig) - def get_hf_processor(self): - return self.ctx.get_hf_processor(FuyuProcessor) + def get_hf_processor(self, **kwargs: object): + return self.ctx.get_hf_processor(FuyuProcessor, **kwargs) def get_image_processor(self) -> FuyuImageProcessor: return self.get_hf_processor().image_processor diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py index 450421302a190..40010ec559066 100644 --- a/vllm/model_executor/models/glm4v.py +++ b/vllm/model_executor/models/glm4v.py @@ -416,18 +416,15 @@ def __call__( class GLM4VProcessingInfo(BaseProcessingInfo): - def get_tokenizer(self): - tokenizer = self.ctx.tokenizer - assert isinstance(tokenizer, PreTrainedTokenizer) - return tokenizer - def get_hf_config(self): return self.ctx.get_hf_config(ChatGLMConfig) - def get_hf_processor(self) -> GLM4VProcessor: - return GLM4VProcessor( - self.get_hf_config(), - self.get_tokenizer(), + def get_hf_processor(self, **kwargs: object) -> GLM4VProcessor: + return self.ctx.init_processor( + GLM4VProcessor, + config=self.get_hf_config(), + tokenizer=self.get_tokenizer(), + **kwargs, ) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py index 7bda54ea7689b..0f3a2ffe9a13e 100644 --- a/vllm/model_executor/models/gritlm.py +++ b/vllm/model_executor/models/gritlm.py @@ -15,9 +15,9 @@ from vllm.model_executor.models.llama import LlamaForCausalLM from vllm.model_executor.pooling_metadata import (PoolingMetadata, PoolingTensors) -from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import (IntermediateTensors, PoolerOutput, PoolingSequenceGroupOutput) +from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config logger = init_logger(__name__) @@ -29,12 +29,7 @@ def __init__(self, model_config: ModelConfig): self.model_config = model_config - tokenizer = cached_get_tokenizer( - self.model_config.tokenizer, - tokenizer_mode=self.model_config.tokenizer_mode, - tokenizer_revision=self.model_config.tokenizer_revision, - trust_remote_code=self.model_config.trust_remote_code, - ) + tokenizer = cached_tokenizer_from_config(self.model_config) # Collect the tokens needed for pattern matching. # "▁<" is different from "_<". 
The former uses "▁" to indicate that diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index cf3e777a2027f..01b721fa79e1a 100644 --- a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -41,6 +41,7 @@ def resolve_h2ovl_min_max_num( dynamic_image_size: bool, use_thumbnail: bool, ) -> tuple[int, int]: + min_dynamic_patch = min_dynamic_patch if dynamic_image_size else 1 max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1 if use_thumbnail and max_dynamic_patch != 1: @@ -190,7 +191,7 @@ def image_to_pixel_values_h2ovl( pixel_values1, aspect_ratio1 = _preprocess_image( image, input_size=input_size, - min_num=min_num, + min_num=1, max_num=max_num, use_thumbnail=True, prior_aspect_ratio=None, @@ -199,7 +200,7 @@ def image_to_pixel_values_h2ovl( pixel_values2, _ = _preprocess_image( image, input_size=input_size, - min_num=3, # Hardcoded value + min_num=3, max_num=max_num, use_thumbnail=True, prior_aspect_ratio=aspect_ratio1, @@ -228,6 +229,7 @@ def __init__( config: PretrainedConfig, tokenizer: AnyTokenizer, *, + min_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None, dynamic_image_size: Optional[bool] = None, use_msac: Optional[bool] = None, @@ -235,6 +237,7 @@ def __init__( super().__init__( config, tokenizer, + min_dynamic_patch=min_dynamic_patch, max_dynamic_patch=max_dynamic_patch, dynamic_image_size=dynamic_image_size, ) @@ -267,11 +270,13 @@ def get_image_repl_full( def resolve_min_max_num( self, *, + min_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None, dynamic_image_size: Optional[bool] = None, use_thumbnail: Optional[bool] = None, ) -> tuple[int, int]: - min_dynamic_patch = self.min_dynamic_patch + min_dynamic_patch = (self.min_dynamic_patch if min_dynamic_patch + is None else min_dynamic_patch) max_dynamic_patch = (self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch) dynamic_image_size = (self.dynamic_image_size if dynamic_image_size @@ -289,18 +294,21 @@ def resolve_min_max_num( def resolve_target_ratios( self, *, + min_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None, dynamic_image_size: Optional[bool] = None, use_thumbnail: Optional[bool] = None, prior_aspect_ratio: Optional[tuple[int, int]] = None, + override_min_num: Optional[int] = None, ) -> list[tuple[int, int]]: min_num, max_num = self.resolve_min_max_num( + min_dynamic_patch=min_dynamic_patch, max_dynamic_patch=max_dynamic_patch, dynamic_image_size=dynamic_image_size, use_thumbnail=use_thumbnail, ) - if prior_aspect_ratio: # hardcoded value for second pass of use_msac - min_num = 3 + if override_min_num is not None: + min_num = override_min_num return get_h2ovl_target_ratios( min_num, @@ -322,6 +330,7 @@ def get_num_image_tokens( if use_msac: target_ratios_1 = self.resolve_target_ratios( use_thumbnail=False, # Applied in calculate_targets + override_min_num=1, ) num_patches_1, _, _, aspect_ratio_1 = calculate_h2ovl_targets( orig_width=image_width, @@ -334,6 +343,7 @@ def get_num_image_tokens( target_ratios_2 = self.resolve_target_ratios( use_thumbnail=False, # Applied in calculate_targets prior_aspect_ratio=aspect_ratio_1, + override_min_num=3, ) num_patches_2, _, _, _ = calculate_h2ovl_targets( orig_width=image_width, @@ -361,12 +371,14 @@ def get_num_image_tokens( def _images_to_pixel_values_lst( self, images: list[Image.Image], + min_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None, dynamic_image_size: 
Optional[bool] = None, ) -> list[torch.Tensor]: use_msac = self.use_msac if len(images) == 1 else False min_num, max_num = self.resolve_min_max_num( + min_dynamic_patch=min_dynamic_patch, max_dynamic_patch=max_dynamic_patch, dynamic_image_size=dynamic_image_size, use_thumbnail=False, # Applied in image_to_pixel_values @@ -389,14 +401,23 @@ class H2OVLProcessingInfo(BaseInternVLProcessingInfo): def get_hf_processor( self, *, + min_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None, dynamic_image_size: Optional[bool] = None, + **kwargs: object, ) -> H2OVLProcessor: - return H2OVLProcessor( - self.get_hf_config(), - self.get_tokenizer(), - max_dynamic_patch=max_dynamic_patch, - dynamic_image_size=dynamic_image_size, + if min_dynamic_patch is not None: + kwargs["min_dynamic_patch"] = min_dynamic_patch + if max_dynamic_patch is not None: + kwargs["max_dynamic_patch"] = max_dynamic_patch + if dynamic_image_size is not None: + kwargs["dynamic_image_size"] = dynamic_image_size + + return self.ctx.init_processor( + H2OVLProcessor, + config=self.get_hf_config(), + tokenizer=self.get_tokenizer(), + **kwargs, ) def get_mm_max_tokens_per_item( diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index fdfabbaafce3f..579253632c81e 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -83,13 +83,15 @@ class Idefics3ImageEmbeddingInputs(TypedDict): class Idefics3ProcessingInfo(BaseProcessingInfo): def get_hf_processor( - self, - *, - size: Optional[Dict[str, int]] = None) -> Idefics3Processor: + self, + *, + size: Optional[Dict[str, int]] = None, + **kwargs: object, + ) -> Idefics3Processor: if size is not None: - return self.ctx.get_hf_processor(Idefics3Processor, size=size) + kwargs["size"] = size - return self.ctx.get_hf_processor(Idefics3Processor) + return self.ctx.get_hf_processor(Idefics3Processor, **kwargs) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 380eb40d9eb28..4a60078767763 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -120,6 +120,7 @@ def resolve_internvl_min_max_num( dynamic_image_size: bool, use_thumbnail: bool, ) -> tuple[int, int]: + min_dynamic_patch = min_dynamic_patch if dynamic_image_size else 1 max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1 if use_thumbnail and max_dynamic_patch != 1: @@ -247,6 +248,7 @@ def __init__( config: PretrainedConfig, tokenizer: AnyTokenizer, *, + min_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None, dynamic_image_size: Optional[bool] = None, ) -> None: @@ -258,18 +260,22 @@ def __init__( image_size: int = config.vision_config.image_size patch_size: int = config.vision_config.patch_size - if dynamic_image_size is None: - dynamic_image_size = config.dynamic_image_size - assert isinstance(dynamic_image_size, bool) + if min_dynamic_patch is None: + min_dynamic_patch = config.min_dynamic_patch + assert isinstance(min_dynamic_patch, int) if max_dynamic_patch is None: max_dynamic_patch = config.max_dynamic_patch assert isinstance(max_dynamic_patch, int) + if dynamic_image_size is None: + dynamic_image_size = config.dynamic_image_size + assert isinstance(dynamic_image_size, bool) + self.num_image_token = int( (image_size // patch_size)**2 * (config.downsample_ratio**2)) self.image_size = image_size - 
self.min_dynamic_patch: int = config.min_dynamic_patch + self.min_dynamic_patch = min_dynamic_patch self.max_dynamic_patch = max_dynamic_patch self.dynamic_image_size = dynamic_image_size self.use_thumbnail: bool = config.use_thumbnail @@ -298,11 +304,13 @@ def get_image_repl_full( def resolve_min_max_num( self, *, + min_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None, dynamic_image_size: Optional[bool] = None, use_thumbnail: Optional[bool] = None, ) -> tuple[int, int]: - min_dynamic_patch = self.min_dynamic_patch + min_dynamic_patch = (self.min_dynamic_patch if min_dynamic_patch + is None else min_dynamic_patch) max_dynamic_patch = (self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch) dynamic_image_size = (self.dynamic_image_size if dynamic_image_size @@ -320,11 +328,13 @@ def resolve_min_max_num( def resolve_target_ratios( self, *, + min_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None, dynamic_image_size: Optional[bool] = None, use_thumbnail: Optional[bool] = None, ) -> list[tuple[int, int]]: min_num, max_num = self.resolve_min_max_num( + min_dynamic_patch=min_dynamic_patch, max_dynamic_patch=max_dynamic_patch, dynamic_image_size=dynamic_image_size, use_thumbnail=use_thumbnail, @@ -355,10 +365,12 @@ def get_num_image_tokens( def _images_to_pixel_values_lst( self, images: list[Image.Image], + min_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None, dynamic_image_size: Optional[bool] = None, ) -> list[torch.Tensor]: min_num, max_num = self.resolve_min_max_num( + min_dynamic_patch=min_dynamic_patch, max_dynamic_patch=max_dynamic_patch, dynamic_image_size=dynamic_image_size, use_thumbnail=False, # Applied in image_to_pixel_values @@ -378,6 +390,7 @@ def __call__( self, text: Optional[Union[str, list[str]]] = None, images: Optional[Union[Image.Image, list[Image.Image]]] = None, + min_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None, dynamic_image_size: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, @@ -396,6 +409,7 @@ def __call__( else: pixel_values_lst = self._images_to_pixel_values_lst( images, + min_dynamic_patch=min_dynamic_patch, max_dynamic_patch=max_dynamic_patch, dynamic_image_size=dynamic_image_size, ) @@ -451,8 +465,10 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo): def get_hf_processor( self, *, + min_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None, dynamic_image_size: Optional[bool] = None, + **kwargs: object, ) -> BaseInternVLProcessor: raise NotImplementedError @@ -642,14 +658,23 @@ class InternVLProcessingInfo(BaseInternVLProcessingInfo): def get_hf_processor( self, *, + min_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None, dynamic_image_size: Optional[bool] = None, + **kwargs: object, ) -> InternVLProcessor: - return InternVLProcessor( - self.get_hf_config(), - self.get_tokenizer(), - max_dynamic_patch=max_dynamic_patch, - dynamic_image_size=dynamic_image_size, + if min_dynamic_patch is not None: + kwargs["min_dynamic_patch"] = min_dynamic_patch + if max_dynamic_patch is not None: + kwargs["max_dynamic_patch"] = max_dynamic_patch + if dynamic_image_size is not None: + kwargs["dynamic_image_size"] = dynamic_image_size + + return self.ctx.init_processor( + InternVLProcessor, + config=self.get_hf_config(), + tokenizer=self.get_tokenizer(), + **kwargs, ) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 
6a4277adb6bf4..19752ba703f45 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -119,7 +119,7 @@ def get_vision_encoder_info(self): return get_vision_encoder_info(self.get_hf_config()) @abstractmethod - def get_hf_processor(self) -> LlavaLikeProcessor: + def get_hf_processor(self, **kwargs: object) -> LlavaLikeProcessor: raise NotImplementedError def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: @@ -208,8 +208,8 @@ def get_dummy_processor_inputs( class LlavaProcessingInfo(BaseLlavaProcessingInfo): - def get_hf_processor(self): - return self.ctx.get_hf_processor(LlavaProcessor) + def get_hf_processor(self, **kwargs: object): + return self.ctx.get_hf_processor(LlavaProcessor, **kwargs) class BaseLlavaMultiModalProcessor(BaseMultiModalProcessor[_I]): @@ -272,8 +272,8 @@ def _get_mm_fields_config( class PixtralHFProcessingInfo(BaseLlavaProcessingInfo): - def get_hf_processor(self): - return self.ctx.get_hf_processor(PixtralProcessor) + def get_hf_processor(self, **kwargs: object): + return self.ctx.get_hf_processor(PixtralProcessor, **kwargs) class PixtralHFMultiModalProcessor( @@ -742,23 +742,24 @@ def load_weights(self, weights: Iterable[Tuple[str, class MantisProcessingInfo(LlavaProcessingInfo): - def get_hf_processor(self): + def get_hf_processor(self, **kwargs: object): hf_config = self.get_hf_config() vision_info = self.get_vision_encoder_info() + kwargs.setdefault("patch_size", vision_info.get_patch_size()) + if Version(TRANSFORMERS_VERSION) < Version("4.48"): # BUG: num_additional_image_tokens = 0 but treated as 1, # so we set vision_feature_select_strategy to None to offset this - vision_feature_select_strategy = None + kwargs.setdefault("vision_feature_select_strategy", None) else: # FIXED: https://github.com/huggingface/transformers/pull/33424/files#diff-6a37acc21efcadaae622b079b2712a131131448ff64262bd219aa346aeec38faL150 - vision_feature_select_strategy = hf_config.vision_feature_select_strategy # noqa: E501 + kwargs.setdefault( + "vision_feature_select_strategy", + hf_config.vision_feature_select_strategy, + ) - return self.ctx.get_hf_processor( - LlavaProcessor, - patch_size=vision_info.get_patch_size(), - vision_feature_select_strategy=vision_feature_select_strategy, - ) + return self.ctx.get_hf_processor(LlavaProcessor, **kwargs) class MantisMultiModalProcessor(LlavaMultiModalProcessor): diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 719916642f25c..c39daec709fc3 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -72,8 +72,8 @@ class LlavaNextProcessingInfo(BaseLlavaProcessingInfo): def get_hf_config(self) -> LlavaNextLikeConfig: return self.ctx.get_hf_config(LlavaNextConfig) - def get_hf_processor(self): - hf_processor = self.ctx.get_hf_processor(LlavaNextProcessor) + def get_hf_processor(self, **kwargs: object): + hf_processor = self.ctx.get_hf_processor(LlavaNextProcessor, **kwargs) # In case patch_size is omitted from `processor_config.json` # e.g. 
for E5-V: https://huggingface.co/royokong/e5-v diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index 817edcef4ba14..2af3cc05080ad 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -56,8 +56,8 @@ def get_hf_config(self): def get_vision_encoder_info(self): return get_vision_encoder_info(self.get_hf_config()) - def get_hf_processor(self): - return self.ctx.get_hf_processor(LlavaNextVideoProcessor) + def get_hf_processor(self, **kwargs: object): + return self.ctx.get_hf_processor(LlavaNextVideoProcessor, **kwargs) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"video": 1} diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 084d4d51ad236..8eb8071e6577a 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -97,8 +97,8 @@ class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo): def get_hf_config(self) -> LlavaOnevisionLikeConfig: return self.ctx.get_hf_config(LlavaOnevisionConfig) - def get_hf_processor(self): - return self.ctx.get_hf_processor(LlavaOnevisionProcessor) + def get_hf_processor(self, **kwargs: object): + return self.ctx.get_hf_processor(LlavaOnevisionProcessor, **kwargs) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None, "video": None} diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 2083e7dc0b83b..97596f9e82c64 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -331,11 +331,8 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo): def get_hf_config(self): return self.ctx.get_hf_config() - def get_hf_processor( - self, - **kwargs: object, - ): - hf_processor = self.ctx.get_hf_processor() + def get_hf_processor(self, **kwargs: object): + hf_processor = self.ctx.get_hf_processor(**kwargs) # NumPy arrays are considered as Iterable but not Sequence in # https://github.com/huggingface/transformers/blob/main/src/transformers/image_transforms.py#L428 diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 3ca22d346b792..1f8f5b2eb136d 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -94,8 +94,8 @@ class MllamaProcessingInfo(BaseProcessingInfo): def get_hf_config(self) -> MllamaConfig: return self.ctx.get_hf_config(MllamaConfig) - def get_hf_processor(self) -> MllamaProcessor: - return self.ctx.get_hf_processor(MllamaProcessor) + def get_hf_processor(self, **kwargs: object) -> MllamaProcessor: + return self.ctx.get_hf_processor(MllamaProcessor, **kwargs) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index b2154ef54af38..1d84d25c96acb 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -1200,8 +1200,8 @@ def __call__( class MolmoProcessingInfo(BaseProcessingInfo): - def get_hf_processor(self) -> MolmoProcessorWrapper: - processor = self.ctx.get_hf_processor() + def get_hf_processor(self, **kwargs: object) -> MolmoProcessorWrapper: + processor = self.ctx.get_hf_processor(**kwargs) return MolmoProcessorWrapper(processor) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: diff --git a/vllm/model_executor/models/nvlm_d.py 
b/vllm/model_executor/models/nvlm_d.py index 9c674ab464463..5de8eeb3fffed 100644 --- a/vllm/model_executor/models/nvlm_d.py +++ b/vllm/model_executor/models/nvlm_d.py @@ -69,14 +69,23 @@ class NVLMProcessingInfo(BaseInternVLProcessingInfo): def get_hf_processor( self, *, + min_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None, dynamic_image_size: Optional[bool] = None, + **kwargs: object, ) -> NVLMProcessor: - return NVLMProcessor( - self.get_hf_config(), - self.get_tokenizer(), - max_dynamic_patch=max_dynamic_patch, - dynamic_image_size=dynamic_image_size, + if min_dynamic_patch is not None: + kwargs["min_dynamic_patch"] = min_dynamic_patch + if max_dynamic_patch is not None: + kwargs["max_dynamic_patch"] = max_dynamic_patch + if dynamic_image_size is not None: + kwargs["dynamic_image_size"] = dynamic_image_size + + return self.ctx.init_processor( + NVLMProcessor, + config=self.get_hf_config(), + tokenizer=self.get_tokenizer(), + **kwargs, ) def get_max_image_tokens(self) -> int: diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 65d810dc23bc6..955a59953eb4a 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -16,8 +16,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import NestedTensors -from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config from .interfaces import SupportsMultiModal, SupportsPP from .siglip import (SiglipVisionModel, dummy_image_for_siglip, @@ -88,7 +88,7 @@ def input_processor_for_paligemma(ctx: InputContext, model_config = ctx.model_config hf_config = ctx.get_hf_config(PaliGemmaConfig) - tokenizer = cached_get_tokenizer(model_config.tokenizer) + tokenizer = cached_tokenizer_from_config(model_config) image_feature_size = hf_config.text_config.num_image_tokens image_token_str = tokenizer.decode(hf_config.image_token_index) bos_token = tokenizer.decode(hf_config.bos_token_id) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 6bbfa40beed1b..207204df20559 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -313,11 +313,12 @@ def get_hf_processor( self, *, num_crops: Optional[int] = None, + **kwargs: object, ) -> ProcessorMixin: if num_crops is not None: - return self.ctx.get_hf_processor(num_crops=num_crops) + kwargs["num_crops"] = num_crops - return self.ctx.get_hf_processor() + return self.ctx.get_hf_processor(**kwargs) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 44fca852805ab..273dc3b1cf75f 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -32,9 +32,9 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.inputs import NestedTensors, PlaceholderRange -from vllm.multimodal.utils import (cached_get_tokenizer, - consecutive_placeholder_ranges) +from vllm.multimodal.utils import consecutive_placeholder_ranges from vllm.sequence import IntermediateTensors, SequenceData +from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config from .interfaces 
import SupportsMultiModal, SupportsPP from .utils import (init_vllm_registered_model, maybe_prefix, @@ -49,9 +49,7 @@ def get_max_pixtral_image_tokens(ctx: InputContext): - tokenizer = cached_get_tokenizer( - ctx.model_config.tokenizer, - tokenizer_mode=ctx.model_config.tokenizer_mode) + tokenizer = cached_tokenizer_from_config(ctx.model_config) mm_encoder = tokenizer.instruct.mm_encoder image_config = mm_encoder.mm_config if hasattr( @@ -65,9 +63,7 @@ def get_max_pixtral_image_tokens(ctx: InputContext): def dummy_data_for_pixtral(ctx: InputContext, seq_len: int, mm_counts: Mapping[str, int]): - tokenizer = cached_get_tokenizer( - ctx.model_config.tokenizer, - tokenizer_mode=ctx.model_config.tokenizer_mode) + tokenizer = cached_tokenizer_from_config(ctx.model_config) mm_encoder = tokenizer.mistral.instruct_tokenizer.mm_encoder image_token_id = mm_encoder.special_ids.img @@ -109,9 +105,7 @@ def input_mapper_for_pixtral(ctx: InputContext, MultiModalKwargs containing the stacked normalized images tensor or image embeddings. """ - model_config = ctx.model_config - tokenizer = cached_get_tokenizer( - model_config.tokenizer, tokenizer_mode=model_config.tokenizer_mode) + tokenizer = cached_tokenizer_from_config(ctx.model_config) data_list = data if isinstance(data, list) else [data] @@ -138,9 +132,7 @@ def input_processor_for_pixtral(ctx: InputContext, inputs: DecoderOnlyInputs): prompt_token_ids = inputs.get("prompt_token_ids") prompt = inputs.get("prompt") - tokenizer = cached_get_tokenizer( - ctx.model_config.tokenizer, - tokenizer_mode=ctx.model_config.tokenizer_mode) + tokenizer = cached_tokenizer_from_config(ctx.model_config) mm_encoder = tokenizer.mistral.instruct_tokenizer.mm_encoder image_token_id = mm_encoder.special_ids.img diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 632ecaf65f2fb..29187eb2ef9c1 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -36,8 +36,6 @@ from transformers.models.qwen2_5_vl import Qwen2_5_VLProcessor from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import ( Qwen2_5_VLConfig, Qwen2_5_VLVisionConfig) -from transformers.models.qwen2_vl import (Qwen2VLImageProcessor, - Qwen2VLImageProcessorFast) from vllm.attention import AttentionMetadata from vllm.config import VllmConfig @@ -690,41 +688,20 @@ def get_hf_processor( *, min_pixels: Optional[int] = None, max_pixels: Optional[int] = None, - fps: Optional[float] = 2.0, + size: Optional[dict[str, int]] = None, + fps: Optional[float] = None, + **kwargs: object, ) -> Qwen2_5_VLProcessor: - hf_processor = self.ctx.get_hf_processor(Qwen2_5_VLProcessor) - image_processor = hf_processor.image_processor # type: ignore - assert isinstance(image_processor, - (Qwen2VLImageProcessor, Qwen2VLImageProcessorFast)) - - if min_pixels: - image_processor.min_pixels = min_pixels - if max_pixels: - image_processor.max_pixels = max_pixels - if max_pixels or min_pixels: - image_processor.size = { - "min_pixels": image_processor.min_pixels, - "max_pixels": image_processor.max_pixels, - } - - return hf_processor - - def get_image_processor( - self, - *, - min_pixels: Optional[int] = None, - max_pixels: Optional[int] = None, - fps: Optional[float] = 2.0, - ) -> Union[Qwen2VLImageProcessor, Qwen2VLImageProcessorFast]: - hf_processor = self.get_hf_processor( - min_pixels=min_pixels, - max_pixels=max_pixels, - fps=fps, + if fps is not None: + kwargs["fps"] = fps + + return self.ctx.get_hf_processor( + Qwen2_5_VLProcessor, + 
image_processor=self.get_image_processor(min_pixels=min_pixels, + max_pixels=max_pixels, + size=size), + **kwargs, ) - image_processor = hf_processor.image_processor # type: ignore - assert isinstance(image_processor, - (Qwen2VLImageProcessor, Qwen2VLImageProcessorFast)) - return image_processor class Qwen2_5_VLMultiModalProcessor(Qwen2VLMultiModalProcessor): diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index cf79544e60e87..3df5dd2bdd419 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -93,8 +93,9 @@ def get_hf_processor( *, # Ignored in initialization sampling_rate: Optional[int] = None, + **kwargs: object, ) -> Qwen2AudioProcessor: - return self.ctx.get_hf_processor(Qwen2AudioProcessor) + return self.ctx.get_hf_processor(Qwen2AudioProcessor, **kwargs) def get_feature_extractor( self, diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 68340ace18ddd..919445267f4a6 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -31,9 +31,7 @@ import torch.nn as nn import torch.nn.functional as F from einops import rearrange, repeat -from packaging.version import Version from transformers import BatchFeature -from transformers import __version__ as TRANSFORMERS_VERSION from transformers.models.qwen2_vl import (Qwen2VLImageProcessor, Qwen2VLProcessor) from transformers.models.qwen2_vl.configuration_qwen2_vl import ( @@ -69,6 +67,8 @@ from vllm.platforms import _Backend from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope +from vllm.transformers_utils.processor import ( + cached_image_processor_from_config) from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, @@ -722,40 +722,64 @@ def get_hf_processor( *, min_pixels: Optional[int] = None, max_pixels: Optional[int] = None, + size: Optional[dict[str, int]] = None, + **kwargs: object, ) -> Qwen2VLProcessor: - hf_processor = self.ctx.get_hf_processor(Qwen2VLProcessor) - image_processor = hf_processor.image_processor # type: ignore - assert isinstance(image_processor, Qwen2VLImageProcessor) - - if min_pixels: - image_processor.min_pixels = min_pixels - if max_pixels: - image_processor.max_pixels = max_pixels - if max_pixels or min_pixels: - image_processor.size = { - "min_pixels": image_processor.min_pixels, - "max_pixels": image_processor.max_pixels, - } - - return hf_processor + return self.ctx.get_hf_processor( + Qwen2VLProcessor, + image_processor=self.get_image_processor(min_pixels=min_pixels, + max_pixels=max_pixels, + size=size), + **kwargs, + ) + + def _get_image_processor_kwargs( + self, + *, + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None, + size: Optional[dict[str, int]] = None, + **kwargs: object, + ): + if self.ctx.model_config.mm_processor_kwargs: + kwargs.update(self.ctx.model_config.mm_processor_kwargs) + + if min_pixels is not None: + kwargs["min_pixels"] = min_pixels + + if size is None: + size = {"shortest_edge": min_pixels} + else: + size["shortest_edge"] = min_pixels + + if max_pixels is not None: + kwargs["max_pixels"] = max_pixels + + if size is None: + size = {"longest_edge": max_pixels} + else: + size["longest_edge"] = max_pixels + + if size is not None: + kwargs["size"] = size + + return kwargs def get_image_processor( self, *, min_pixels: Optional[int] = None, max_pixels: Optional[int] = None, 
+ size: Optional[dict[str, int]] = None, + **kwargs: object, ): - hf_processor = self.get_hf_processor(min_pixels=min_pixels, - max_pixels=max_pixels) - image_processor = hf_processor.image_processor # type: ignore - if Version(TRANSFORMERS_VERSION) >= Version("4.49"): - from transformers.models.qwen2_vl import Qwen2VLImageProcessorFast - assert isinstance( - image_processor, - (Qwen2VLImageProcessor, Qwen2VLImageProcessorFast)) - else: - assert isinstance(image_processor, Qwen2VLImageProcessor) - return image_processor + return cached_image_processor_from_config( + self.ctx.model_config, + **self._get_image_processor_kwargs(min_pixels=min_pixels, + max_pixels=max_pixels, + size=size, + **kwargs), + ) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None, "video": None} @@ -952,6 +976,18 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo] def _get_data_parser(self) -> MultiModalDataParser: return Qwen2VLMultiModalDataParser() + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + return self.info.ctx.call_hf_processor( + self.info.get_hf_processor(**mm_kwargs), + dict(text=prompt, **mm_data), + self.info._get_image_processor_kwargs(**mm_kwargs), + ) + def _get_prompt_replacements( self, mm_items: MultiModalDataItems, @@ -964,8 +1000,6 @@ def _get_prompt_replacements( tokenizer = self.info.get_tokenizer() vocab = tokenizer.get_vocab() - # NOTE: Only Qwen2VLProcessor in transformers 4.47.0 has - # image_token and video_token registered placeholder = { "image": vocab[hf_processor.image_token], "video": vocab[hf_processor.video_token], diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py index 0f4f5072fb2b4..61a4584abf852 100644 --- a/vllm/model_executor/models/qwen_vl.py +++ b/vllm/model_executor/models/qwen_vl.py @@ -519,8 +519,13 @@ def get_tokenizer(self) -> PreTrainedTokenizer: return _get_tokenizer_without_image_pad(tokenizer) - def get_hf_processor(self) -> QwenVLProcessor: - return QwenVLProcessor(self.get_hf_config(), self.get_tokenizer()) + def get_hf_processor(self, **kwargs: object) -> QwenVLProcessor: + return self.ctx.init_processor( + QwenVLProcessor, + config=self.get_hf_config(), + tokenizer=self.get_tokenizer(), + **kwargs, + ) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 063997a14a66f..e24b4aeb8ae84 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -68,8 +68,9 @@ def get_hf_processor( *, # Ignored in initialization sampling_rate: Optional[int] = None, + **kwargs: object, ) -> ProcessorMixin: - hf_processor = self.ctx.get_hf_processor() + hf_processor = self.ctx.get_hf_processor(**kwargs) # NOTE: Ultravox processing definition uses '<|eot_id|>' as the # placeholder that will cause confusion with the actual end of turn diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index 0b506072094e3..073a30d25e239 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -29,7 +29,7 @@ NestedTensors) from vllm.multimodal.audio import resample_audio from vllm.sequence import SequenceData -from vllm.transformers_utils.processor import cached_get_processor +from vllm.transformers_utils.processor import cached_processor_from_config from 
.interfaces import SupportsMultiModal, SupportsTranscription from .utils import AutoWeightsLoader, WeightsMapper, make_layers @@ -579,7 +579,7 @@ def dummy_encoder_data_for_whisper(ctx: InputContext, seq_len: int, mm_counts: Mapping[str, int]): assert mm_counts["audio"] == 1 num_tokens = get_max_whisper_audio_tokens(ctx) - processor = cached_get_processor(ctx.model_config.model) + processor = cached_processor_from_config(ctx.model_config) chunk_length = processor.feature_extractor.chunk_length sampling_rate = processor.feature_extractor.sampling_rate num_samples = chunk_length * sampling_rate @@ -596,7 +596,7 @@ def input_processor_for_whisper(ctx: InputContext, inputs): multi_modal_data["audio"] = multi_modal_data["audio"][0] # Resample and process audio audio, orig_sr = multi_modal_data["audio"] - processor = cached_get_processor(ctx.model_config.model) + processor = cached_processor_from_config(ctx.model_config) target_sr = processor.feature_extractor.sampling_rate audio = resample_audio(audio, orig_sr=orig_sr, target_sr=target_sr) multi_modal_data["audio"] = (audio, target_sr) @@ -618,7 +618,7 @@ def input_mapper_for_whisper( if len(multi_modal_data) == 0: return MultiModalKwargs() - processor = cached_get_processor(ctx.model_config.model) + processor = cached_processor_from_config(ctx.model_config) sampling_rate = processor.feature_extractor.sampling_rate audios = [audio for audio, _ in multi_modal_data] diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index 98ac8057e8f18..98ece8f806f1d 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import base64 -from functools import lru_cache from io import BytesIO from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, Optional @@ -11,7 +10,7 @@ from vllm.inputs.registry import InputContext from vllm.logger import init_logger -from vllm.transformers_utils.processor import get_image_processor +from vllm.transformers_utils.processor import cached_get_image_processor from vllm.utils import is_list_of from .base import MediaIO, MultiModalPlugin @@ -22,8 +21,6 @@ logger = init_logger(__name__) -cached_get_image_processor = lru_cache(get_image_processor) - class ImagePlugin(MultiModalPlugin): """Plugin for image data.""" diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 613d1db416720..1882ffe9bf69f 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -11,7 +11,8 @@ from vllm.envs import VLLM_MM_INPUT_CACHE_SIZE from vllm.inputs import InputProcessingContext from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.transformers_utils.tokenizer import (AnyTokenizer, + cached_tokenizer_from_config) from vllm.utils import ClassRegistry from .audio import AudioPlugin @@ -21,7 +22,6 @@ from .processing import (BaseMultiModalProcessor, BaseProcessingInfo, ProcessingCache) from .profiling import BaseDummyInputsBuilder, MultiModalProfiler -from .utils import cached_get_tokenizer from .video import VideoPlugin if TYPE_CHECKING: @@ -256,10 +256,7 @@ def get_max_tokens_per_item_by_modality( on underlying model configuration. 
""" if self.has_processor(model_config): - tokenizer = cached_get_tokenizer( - model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code, - ) + tokenizer = cached_tokenizer_from_config(model_config) processor = self.create_processor(model_config, tokenizer) seq_len = model_config.max_model_len mm_limits = self.get_mm_limits_per_prompt(model_config) @@ -374,10 +371,7 @@ def get_mm_limits_per_prompt( This should be called after :meth:`init_mm_limits_per_prompt`. """ if self.has_processor(model_config): - tokenizer = cached_get_tokenizer( - model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code, - ) + tokenizer = cached_tokenizer_from_config(model_config) processor = self.create_processor(model_config, tokenizer) profiler = MultiModalProfiler(processor) return profiler.get_mm_limits() diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 583f536551243..6e6c10b34a25f 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from functools import lru_cache from itertools import groupby from pathlib import Path from typing import TYPE_CHECKING, Optional, TypeVar, Union @@ -13,7 +12,7 @@ import vllm.envs as envs from vllm.connections import HTTPConnection, global_http_connection from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer +from vllm.transformers_utils.tokenizer import AnyTokenizer from .audio import AudioMediaIO from .base import MediaIO @@ -23,8 +22,6 @@ logger = init_logger(__name__) -cached_get_tokenizer = lru_cache(get_tokenizer) - _M = TypeVar("_M") if TYPE_CHECKING: diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 78a2918e3ed3e..8004377191b38 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import base64 -from functools import lru_cache, partial +from functools import partial from io import BytesIO from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, Optional @@ -12,8 +12,7 @@ from vllm.inputs.registry import InputContext from vllm.logger import init_logger -from vllm.transformers_utils.processor import get_video_processor -from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.transformers_utils.processor import cached_get_video_processor from vllm.utils import PlaceholderModule, is_list_of from .base import MediaIO, ModalityData @@ -30,9 +29,6 @@ logger = init_logger(__name__) -cached_get_video_processor = lru_cache(get_video_processor) -cached_get_tokenizer = lru_cache(get_tokenizer) - class VideoPlugin(ImagePlugin): """Plugin for video data.""" diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py index 3197b07d8a468..29fab16c25c11 100644 --- a/vllm/transformers_utils/processor.py +++ b/vllm/transformers_utils/processor.py @@ -1,25 +1,59 @@ # SPDX-License-Identifier: Apache-2.0 from functools import lru_cache -from typing import Any, cast +from typing import TYPE_CHECKING, Any, Union, cast from transformers.processing_utils import ProcessorMixin +from typing_extensions import TypeVar + +if TYPE_CHECKING: + from vllm.config import ModelConfig + +_P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin) + + +class HashableDict(dict): + """ + A dictionary that can be hashed by lru_cache. 
+    """
+
+    # NOTE: pythonic dict is not hashable,
+    # we override on it directly for simplicity
+    def __hash__(self) -> int:  # type: ignore[override]
+        return hash(frozenset(self.items()))
+
+
+def _merge_mm_kwargs(model_config: "ModelConfig", **kwargs):
+    base_kwargs = model_config.mm_processor_kwargs
+    if base_kwargs is None:
+        base_kwargs = {}
+
+    merged_kwargs = {**base_kwargs, **kwargs}
+
+    # NOTE: Pythonic dict is not hashable and will raise unhashable type
+    # error when calling `cached_get_processor`, therefore we need to
+    # wrap it to a hashable dict.
+    for key, value in merged_kwargs.items():
+        if isinstance(value, dict):
+            merged_kwargs[key] = HashableDict(value)
+
+    return merged_kwargs

 def get_processor(
     processor_name: str,
     *args: Any,
     trust_remote_code: bool = False,
-    processor_cls: type[ProcessorMixin] = ProcessorMixin,
+    processor_cls: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin,
     **kwargs: Any,
-):
+) -> _P:
     """Load a processor for the given model name via HuggingFace."""
     # don't put this import at the top level
     # it will call torch.cuda.device_count()
     from transformers import AutoProcessor

-    processor_factory = (AutoProcessor
-                         if processor_cls == ProcessorMixin else processor_cls)
+    processor_factory = (AutoProcessor if processor_cls == ProcessorMixin or
+                         isinstance(processor_cls, tuple) else processor_cls)

     try:
         processor = processor_factory.from_pretrained(
@@ -43,12 +77,30 @@ def get_processor(
     else:
         raise e

-    return cast(ProcessorMixin, processor)
+    if not isinstance(processor, processor_cls):
+        raise TypeError("Invalid type of HuggingFace processor. "
+                        f"Expected type: {processor_cls}, but "
+                        f"found type: {type(processor)}")
+
+    return processor


 cached_get_processor = lru_cache(get_processor)


+def cached_processor_from_config(
+    model_config: "ModelConfig",
+    processor_cls: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin,
+    **kwargs: Any,
+) -> _P:
+    return cached_get_processor(
+        model_config.model,
+        trust_remote_code=model_config.trust_remote_code,
+        processor_cls=processor_cls,  # type: ignore[arg-type]
+        **_merge_mm_kwargs(model_config, **kwargs),
+    )
+
+
 def get_image_processor(
     processor_name: str,
     *args: Any,
@@ -85,6 +137,20 @@ def get_image_processor(
     return cast(BaseImageProcessor, processor)


+cached_get_image_processor = lru_cache(get_image_processor)
+
+
+def cached_image_processor_from_config(
+    model_config: "ModelConfig",
+    **kwargs: Any,
+):
+    return cached_get_image_processor(
+        model_config.model,
+        trust_remote_code=model_config.trust_remote_code,
+        **_merge_mm_kwargs(model_config, **kwargs),
+    )
+
+
 def get_video_processor(
     processor_name: str,
     *args: Any,
@@ -104,3 +170,17 @@ def get_video_processor(
     )

     return cast(BaseImageProcessor, processor.video_processor)
+
+
+cached_get_video_processor = lru_cache(get_video_processor)
+
+
+def cached_video_processor_from_config(
+    model_config: "ModelConfig",
+    **kwargs: Any,
+):
+    return cached_get_video_processor(
+        model_config.model,
+        trust_remote_code=model_config.trust_remote_code,
+        **_merge_mm_kwargs(model_config, **kwargs),
+    )
diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index 0c0f68ac123e2..f0aa5fdcaa61f 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -3,9 +3,10 @@
 import contextlib
 import os
 import warnings
+from functools import lru_cache
 from pathlib import Path
 from types import MethodType
-from typing import Optional, Union
+from typing import TYPE_CHECKING, Any, Optional, Union

 import huggingface_hub
 from transformers import (AutoTokenizer, PreTrainedTokenizer,
@@ -20,6 +21,9 @@
 from vllm.transformers_utils.utils import check_gguf_file
 from vllm.utils import make_async

+if TYPE_CHECKING:
+    from vllm.config import ModelConfig
+
 logger = init_logger(__name__)

 AnyTokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast,
@@ -232,6 +236,22 @@ def get_tokenizer(
     return tokenizer


+cached_get_tokenizer = lru_cache(get_tokenizer)
+
+
+def cached_tokenizer_from_config(
+    model_config: "ModelConfig",
+    **kwargs: Any,
+):
+    return cached_get_tokenizer(
+        model_config.tokenizer,
+        tokenizer_mode=model_config.tokenizer_mode,
+        tokenizer_revision=model_config.tokenizer_revision,
+        trust_remote_code=model_config.trust_remote_code,
+        **kwargs,
+    )
+
+
 def get_lora_tokenizer(lora_request: LoRARequest, *args,
                        **kwargs) -> Optional[AnyTokenizer]:
     if lora_request is None:
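
The two config-driven helpers added above are meant to replace the scattered `cached_get_tokenizer(model_config.tokenizer, ...)` and `cached_get_processor(model_config.model, ...)` call sites. A minimal usage sketch follows; the wrapper functions and the Whisper example are illustrative, not part of the patch.

from transformers import WhisperProcessor

from vllm.transformers_utils.processor import cached_processor_from_config
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config


def load_tokenizer(model_config):
    # Tokenizer name, mode, revision and trust_remote_code are all taken
    # from the ModelConfig; repeated calls with the same values hit the
    # lru_cache instead of reloading from disk.
    return cached_tokenizer_from_config(model_config)


def load_whisper_processor(model_config):
    # processor_cls now doubles as a runtime check: if the resolved
    # processor is not a WhisperProcessor, get_processor raises TypeError
    # instead of silently returning the wrong class.
    return cached_processor_from_config(model_config,
                                        processor_cls=WhisperProcessor)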
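Why `_merge_mm_kwargs` wraps dict values in `HashableDict`: `lru_cache` hashes every argument, so a plain dict such as a `size` override would break the cached lookup. A self-contained sketch of the failure mode and the fix, re-implemented standalone for illustration only:

from functools import lru_cache


class HashableDict(dict):
    """A dict that lru_cache can hash (same trick as in the patch)."""

    def __hash__(self) -> int:  # type: ignore[override]
        return hash(frozenset(self.items()))


@lru_cache
def fake_load(name: str, **kwargs):
    # Stand-in for a cached processor factory.
    return (name, tuple(sorted(kwargs)))


# fake_load("proc", size={"shortest_edge": 256})
# -> TypeError: unhashable type: 'dict'
fake_load("proc", size=HashableDict({"shortest_edge": 256}))  # cached fine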
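The per-model edits above all follow the same pattern: `get_hf_processor` accepts `**kwargs` and forwards them, so overrides supplied through `mm_processor_kwargs` can reach the HuggingFace processor. A hedged sketch of what a new `ProcessingInfo` subclass is expected to look like (the class name is hypothetical and unrelated abstract methods are omitted):

from typing import Mapping, Optional

from vllm.multimodal.processing import BaseProcessingInfo


class MyModelProcessingInfo(BaseProcessingInfo):

    def get_hf_processor(self, **kwargs: object):
        # Forward everything; the context is expected to merge
        # model_config.mm_processor_kwargs with these per-call overrides
        # when it instantiates the processor.
        return self.ctx.get_hf_processor(**kwargs)

    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None}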
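For Qwen2-VL, `_get_image_processor_kwargs` also mirrors explicit `min_pixels`/`max_pixels` overrides into the image processor's `size` argument. A self-contained sketch of that mapping, re-implemented here rather than imported so the expected output can be checked directly:

from typing import Optional


def map_pixel_overrides(min_pixels: Optional[int] = None,
                        max_pixels: Optional[int] = None,
                        size: Optional[dict] = None) -> dict:
    # Mirrors the min/max pixel handling above: explicit overrides are kept
    # as-is and also reflected into the size dict.
    kwargs: dict = {}
    if min_pixels is not None:
        kwargs["min_pixels"] = min_pixels
        size = {**(size or {}), "shortest_edge": min_pixels}
    if max_pixels is not None:
        kwargs["max_pixels"] = max_pixels
        size = {**(size or {}), "longest_edge": max_pixels}
    if size is not None:
        kwargs["size"] = size
    return kwargs


assert map_pixel_overrides(min_pixels=256 * 28 * 28,
                           max_pixels=1280 * 28 * 28) == {
    "min_pixels": 200704,
    "max_pixels": 1003520,
    "size": {"shortest_edge": 200704, "longest_edge": 1003520},
}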
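End to end, the kwargs merged by `_merge_mm_kwargs` originate from the engine's `mm_processor_kwargs` argument. A hedged usage sketch; the model name and pixel budgets are illustrative, and any kwarg supported by the model's HF processor can be passed the same way:

from vllm import LLM

llm = LLM(
    model="Qwen/Qwen2-VL-2B-Instruct",
    mm_processor_kwargs={
        "min_pixels": 28 * 28,
        "max_pixels": 1280 * 28 * 28,
    },
)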