
Commit 09e974d

[Bugfix] Check dimensions of multimodal embeddings in V1 (#15816)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Parent: e5ef4fa

14 files changed: +98 −37 lines
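The commit title describes adding a dimension check for multimodal embeddings in V1, and the model-side hunks below normalize each per-item embedding to a 2-D tensor. As a hypothetical illustration only (this sketch is not taken from the changed files shown here; the function name, signature, and error message are assumptions), such a check could verify that each item's embedding has shape (num_placeholder_tokens, hidden_size):

import torch

def check_mm_embedding_dims(embeddings: list[torch.Tensor],
                            expected_tokens: list[int],
                            hidden_size: int) -> None:
    # Hypothetical validation sketch, not vLLM V1 code: every per-item
    # embedding is expected to be 2-D with shape
    # (num_placeholder_tokens, hidden_size).
    for i, (emb, num_tokens) in enumerate(zip(embeddings, expected_tokens)):
        if emb.ndim != 2 or tuple(emb.shape) != (num_tokens, hidden_size):
            raise ValueError(
                f"Multimodal item {i}: expected embedding of shape "
                f"({num_tokens}, {hidden_size}), got {tuple(emb.shape)}")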

examples/offline_inference/vision_language.py

Lines changed: 5 additions & 4 deletions

@@ -68,7 +68,7 @@ def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
     # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
     prompts = [f"Question: {question} Answer:" for question in questions]
     engine_args = EngineArgs(
-        model="Salesforce/blip2-opt-2.7b",
+        model="Salesforce/blip2-opt-6.7b",
         disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
     )

@@ -128,7 +128,8 @@ def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
     engine_args = EngineArgs(
         model="microsoft/Florence-2-large",
         tokenizer="facebook/bart-large",
-        max_num_seqs=8,
+        max_model_len=4096,
+        max_num_seqs=2,
         trust_remote_code=True,
         dtype="bfloat16",
         disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,

@@ -511,7 +512,7 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
     engine_args = EngineArgs(
         model=model_name,
         max_model_len=4096,
-        max_num_seqs=16,
+        max_num_seqs=2,
         disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
     )

@@ -700,7 +701,7 @@ def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
     # NOTE: Need L40 (or equivalent) to avoid OOM
     engine_args = EngineArgs(
         model=model_name,
-        max_model_len=8192,
+        max_model_len=6144,
         max_num_seqs=2,
         disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
     )

tests/distributed/test_pipeline_parallel.py

Lines changed: 1 addition & 1 deletion

@@ -217,7 +217,7 @@ def iter_params(self, model_id: str):

 MULTIMODAL_MODELS = {
     # [Decoder-only]
-    "Salesforce/blip2-opt-2.7b": PPTestSettings.fast(),
+    "Salesforce/blip2-opt-6.7b": PPTestSettings.fast(),
     "facebook/chameleon-7b": PPTestSettings.fast(),
     "adept/fuyu-8b": PPTestSettings.fast(),
     "THUDM/glm-4v-9b": PPTestSettings.fast(),

tests/models/decoder_only/vision_language/test_models.py

Lines changed: 4 additions & 4 deletions

@@ -34,8 +34,6 @@
     # V1 Test: no way to fall back for head_dim = 80
     # https://github.com/vllm-project/vllm/issues/14524
     "qwen_vl",
-    "h2ovl",
-    "blip2",
     # V1 Test: not enough KV cache space in CI.
     "fuyu",
 ]

@@ -161,7 +159,8 @@
         marks=[large_gpu_mark(min_gb=64)],
     ),
     "blip2": VLMTestInfo(
-        models=["Salesforce/blip2-opt-2.7b"],
+        # TODO: Change back to 2.7b once head_dim = 80 is supported
+        models=["Salesforce/blip2-opt-6.7b"],
         test_type=VLMTestType.IMAGE,
         prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
         img_idx_to_prompt=lambda idx: "",

@@ -248,7 +247,8 @@
     "h2ovl": VLMTestInfo(
         models=[
             "h2oai/h2ovl-mississippi-800m",
-            "h2oai/h2ovl-mississippi-2b",
+            # TODO: Re-enable once head_dim = 80 is supported
+            # "h2oai/h2ovl-mississippi-2b",
         ],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
         prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>",  # noqa: E501

tests/models/registry.py

Lines changed: 2 additions & 1 deletion

@@ -259,7 +259,8 @@ def check_available_online(
 _MULTIMODAL_EXAMPLE_MODELS = {
     # [Decoder-only]
     "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
-    "Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b"),  # noqa: E501
+    "Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b",  # noqa: E501
+                                                     extras={"6b": "Salesforce/blip2-opt-6.7b"}),  # noqa: E501
     "ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"),  # noqa: E501
     "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny",  # noqa: E501
                                                extras={"fork": "Isotr0py/deepseek-vl2-tiny"},  # noqa: E501

vllm/model_executor/models/florence2.py

Lines changed: 2 additions & 1 deletion

@@ -875,7 +875,8 @@ def _get_prompt_updates(
     Florence2MultiModalProcessor,
     info=Florence2ProcessingInfo,
     dummy_inputs=Florence2DummyInputsBuilder)
-class Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal):
+class Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal,
+                                        SupportsV0Only):

     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()

vllm/model_executor/models/fuyu.py

Lines changed: 14 additions & 9 deletions

@@ -39,7 +39,6 @@
                                         PromptUpdate, PromptUpdateDetails)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
-from vllm.utils import flatten_2d_lists

 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix,

@@ -66,10 +65,13 @@ class FuyuImagePatchInputs(TypedDict):
     This is used to split the embeddings which has the first two dimensions
     flattened just like `flat_data`.
     """
+
     embed_is_patch: Union[torch.Tensor, list[torch.Tensor]]
     """
     A boolean mask indicating which image embeddings correspond
     to patch tokens.
+
+    Shape: `(batch_size * num_images, num_embeds)`
     """


@@ -322,16 +324,18 @@ def _validate_shape(d: torch.Tensor):
     def _parse_and_validate_image_input(
             self, **kwargs: object) -> Optional[FuyuImagePatchInputs]:
         image_patches = kwargs.pop("image_patches", None)
-        embed_is_patch = kwargs.pop("embed_is_patch", None)
         if image_patches is not None:
             if not isinstance(image_patches, (torch.Tensor, list)):
                 raise ValueError("Incorrect type of image patches. "
                                  f"Got type: {type(image_patches)}")

+            embed_is_patch = kwargs.pop("embed_is_patch")
             if not isinstance(embed_is_patch, (torch.Tensor, list)):
                 raise ValueError("Incorrect type of embed_is_patch. "
                                  f"Got type: {type(embed_is_patch)}")
+
             image_patches_flat = flatten_bn(image_patches)
+            embed_is_patch = flatten_bn(embed_is_patch)

             return FuyuImagePatchInputs(
                 type="image_patches",

@@ -351,20 +355,21 @@ def _process_image_input(
         assert self.vision_embed_tokens is not None
         vision_embeddings_flat, _ = self.vision_embed_tokens(
             image_patches_flat)
+
         return vision_embeddings_flat.split(patches_per_image, dim=0)

     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return None
-        vision_embeddings = self._process_image_input(image_input)
-        #return vision_embeddings
-        return flatten_2d_lists(
-            scatter_patch_features(*args) for args in zip(
-                vision_embeddings,
-                image_input["embed_is_patch"],
-            ))
+
+        image_features = self._process_image_input(image_input)
+
+        return scatter_patch_features(
+            image_features,
+            image_input["embed_is_patch"],
+        )

     def get_input_embeddings(
         self,
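For orientation, the rewritten get_multimodal_embeddings above passes the per-image features together with the embed_is_patch mask to scatter_patch_features. The toy snippet below is a stand-in, not vLLM's helper: it only illustrates how a boolean mask of shape (num_embeds,) can place patch features into the positions reserved for patch tokens; every name and size here is an assumption for demonstration.

import torch

# Toy illustration only: NOT vLLM's scatter_patch_features implementation.
hidden_size = 4
embed_is_patch = torch.tensor([False, True, True, False, True])  # assumed mask
patch_features = torch.randn(int(embed_is_patch.sum()), hidden_size)

# Fill non-patch positions with NaN, then scatter patch features into place.
full_embeds = torch.full((embed_is_patch.numel(), hidden_size), float("nan"))
full_embeds[embed_is_patch] = patch_features

assert full_embeds.shape == (embed_is_patch.numel(), hidden_size)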

vllm/model_executor/models/gemma3_mm.py

Lines changed: 4 additions & 2 deletions

@@ -613,7 +613,7 @@ def _image_pixels_to_features(
     def _process_image_input(
         self,
         image_input: Gemma3ImageInputs,
-    ) -> tuple[torch.Tensor, ...]:
+    ) -> list[torch.Tensor]:
         assert self.vision_tower is not None

         pixel_values = image_input["pixel_values"]

@@ -625,7 +625,9 @@
         )
         image_embeds = self.multi_modal_projector(image_features)

-        return image_embeds.split(num_patches.tolist())
+        return [
+            e.flatten(0, 1) for e in image_embeds.split(num_patches.tolist())
+        ]

     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
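The same split-then-flatten normalization recurs in idefics3.py, llava_next_video.py, and minicpmv.py below, so that each multimodal item yields a 2-D embedding. A minimal runnable sketch of the pattern, with made-up sizes (the shapes here are assumptions, not values from vLLM):

import torch

# Minimal sketch of the split-then-flatten pattern above; all sizes are assumed.
hidden_size = 8
# e.g. two images contributing 3 and 2 patches, 4 embeddings per patch
image_embeds = torch.randn(5, 4, hidden_size)
num_patches = torch.tensor([3, 2])

per_image = [
    e.flatten(0, 1)  # (num_patches_i, 4, hidden) -> (num_patches_i * 4, hidden)
    for e in image_embeds.split(num_patches.tolist())
]

# Each item is now a 2-D (num_image_tokens, hidden_size) tensor, the per-item
# shape that the commit's V1 dimension check is concerned with.
assert [tuple(e.shape) for e in per_image] == [(12, 8), (8, 8)]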

vllm/model_executor/models/idefics3.py

Lines changed: 7 additions & 2 deletions

@@ -733,15 +733,20 @@ def _process_image_pixels(
             pixel_attention_mask=pixel_attention_mask,
         )

-    def _process_image_input(self, image_input: ImageInputs) -> torch.Tensor:
+    def _process_image_input(
+        self,
+        image_input: ImageInputs,
+    ) -> Union[torch.Tensor, list[torch.Tensor]]:
         if image_input["type"] == "image_embeds":
             return image_input["data"]

         image_features = self._process_image_pixels(image_input)
         image_features = self.model.connector(image_features)

         num_patches = image_input["num_patches"]
-        return image_features.split(num_patches.tolist())
+        return [
+            e.flatten(0, 1) for e in image_features.split(num_patches.tolist())
+        ]

     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:

vllm/model_executor/models/llava_next_video.py

Lines changed: 5 additions & 4 deletions

@@ -406,20 +406,21 @@ def _process_video_pixels(self, inputs: LlavaNextVideoPixelInputs):
                                                   h, w)
             stacked_embeddings = self._video_pixels_to_features(
                 self.vision_tower, stacked_pixels)
-            return stacked_embeddings.view(b, num_frames,
-                                           *stacked_embeddings.shape[1:])
+            embeds = stacked_embeddings.view(b, num_frames,
+                                             *stacked_embeddings.shape[1:])

         elif is_list_of(video_pixels, torch.Tensor):
             frames_per_videos = [v.shape[0] for v in video_pixels]
             stacked_pixels = torch.cat(video_pixels, dim=0)
             stacked_embeddings = self._video_pixels_to_features(
                 self.vision_tower, stacked_pixels)
-            return torch.split(stacked_embeddings, frames_per_videos, dim=0)
-
+            embeds = torch.split(stacked_embeddings, frames_per_videos, dim=0)
         else:
             raise ValueError(
                 f"Unsupported type of video input {type(video_pixels)}")

+        return [e.flatten(0, 1) for e in embeds]
+
     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
         video_input = self._parse_and_validate_video_input(**kwargs)

vllm/model_executor/models/minicpmv.py

Lines changed: 5 additions & 2 deletions

@@ -919,8 +919,11 @@ def _process_vision_input(

         image_features_flat = self.get_vision_hidden_states(image_input)

-        # Reconstruct the batch dimension
-        return image_features_flat.split(image_input["num_slices"].tolist())
+        num_slices = image_input["num_slices"]
+        return [
+            e.flatten(0, 1)
+            for e in image_features_flat.split(num_slices.tolist())
+        ]

     def _process_multimodal_inputs(self, modalities: dict):
         # The result multimodal_embeddings is tuple of tensors, with each
