Skip to content

Commit 73e0225

Browse files
[Bugfix] Check that number of images matches number of <|image|> tokens with mllama (#13911)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
1 parent 6c85da3 commit 73e0225

File tree

2 files changed: 26 additions and 3 deletions.

tests/models/encoder_decoder/vision_language/test_mllama.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -479,8 +479,9 @@ def test_regression(vllm_runner, image_assets, model, dtype, max_tokens,
479479

480480
# Regression tests for https://github.com/vllm-project/vllm/issues/10648
481481

482-
# Number of image tags is greater than the number of images provided
483-
prompt = "<|begin_of_text|><|image|><|image|> Compare the two images" # noqa: E501
482+
# Number of groups of image tokens is greater than the number of images
483+
# provided (the whitespace between the tags is necessary)
484+
prompt = "<|begin_of_text|><|image|> <|image|> Compare the two images" # noqa: E501
484485
image = stop_sign
485486
with pytest.raises(ValueError):
486487
vllm_model.generate_greedy_logprobs([prompt],

vllm/model_executor/models/mllama.py

Lines changed: 23 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -54,7 +54,8 @@
5454
default_weight_loader, maybe_remap_kv_scale_name)
5555
from vllm.model_executor.sampling_metadata import SamplingMetadata
5656
from vllm.multimodal import MULTIMODAL_REGISTRY
57-
from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
57+
from vllm.multimodal.inputs import (MultiModalEncDecInputs,
58+
MultiModalFieldConfig, MultiModalKwargs)
5859
from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
5960
MultiModalDataDict, MultiModalDataItems)
6061
from vllm.multimodal.processing import (BaseProcessingInfo,
@@ -169,6 +170,27 @@ def get_dummy_processor_inputs(
169170
class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo]
170171
):
171172

173+
def apply(
174+
self,
175+
prompt: Union[str, list[int]],
176+
mm_data: MultiModalDataDict,
177+
hf_processor_mm_kwargs: Mapping[str, object],
178+
) -> MultiModalEncDecInputs:
179+
mm_inputs = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
180+
181+
# Check that the number of image tokens in the decoder prompt matches
182+
# the number of images provided in mm_data
183+
num_image_tokens = mm_inputs['prompt_token_ids'].count(
184+
self.info.get_hf_config().image_token_index)
185+
image_data = mm_data.get("image", [])
186+
num_images = 1 if isinstance(image_data, Image) else len(image_data)
187+
if num_image_tokens != num_images:
188+
raise ValueError(
189+
f"The number of image tokens ({num_image_tokens}) must be"
190+
f" the same as the number of images ({num_images})")
191+
192+
return mm_inputs
193+
172194
def _call_hf_processor(
173195
self,
174196
prompt: str,

0 commit comments

Comments
 (0)