Skip to content

Commit 35dcf9b

Browse files
DarkLight1337 authored and xuebwang-amd committed
[Bugfix] Standardize merging multimodal embeddings (vllm-project#26771)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: xuebwang-amd <xuebwang@amd.com>
1 parent 4be3401 commit 35dcf9b

19 files changed

57 additions (+57) and 57 deletions (-57)

vllm/model_executor/models/ernie45_vl.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1645,12 +1645,12 @@ def get_multimodal_embeddings(
16451645
for modality in modalities:
16461646
if modality == "images":
16471647
image_input = modalities["images"]
1648-
vision_embeddings = self._process_image_input(image_input)
1649-
multimodal_embeddings += vision_embeddings
1648+
image_embeddings = self._process_image_input(image_input)
1649+
multimodal_embeddings += tuple(image_embeddings)
16501650
if modality == "videos":
16511651
video_input = modalities["videos"]
16521652
video_embeddings = self._process_video_input(video_input)
1653-
multimodal_embeddings += video_embeddings
1653+
multimodal_embeddings += tuple(video_embeddings)
16541654

16551655
return multimodal_embeddings
16561656

vllm/model_executor/models/glm4_1v.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1608,11 +1608,11 @@ def get_multimodal_embeddings(
16081608
for modality in mm_input_by_modality:
16091609
multimodal_input = mm_input_by_modality[modality]
16101610
if modality == "image":
1611-
vision_embeddings = self._process_image_input(multimodal_input)
1612-
multimodal_embeddings += vision_embeddings
1611+
image_embeddings = self._process_image_input(multimodal_input)
1612+
multimodal_embeddings += tuple(image_embeddings)
16131613
if modality == "video":
16141614
video_embeddings = self._process_video_input(multimodal_input)
1615-
multimodal_embeddings += video_embeddings
1615+
multimodal_embeddings += tuple(video_embeddings)
16161616
return multimodal_embeddings
16171617

16181618
def forward(

vllm/model_executor/models/hyperclovax_vision.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -749,12 +749,12 @@ def get_multimodal_embeddings(
749749
for modality in modalities:
750750
if modality == "images":
751751
image_input = modalities["images"]
752-
vision_embeddings = self._process_image_input(image_input)
753-
multimodal_embeddings += vision_embeddings
752+
image_embeddings = self._process_image_input(image_input)
753+
multimodal_embeddings += tuple(image_embeddings)
754754
if modality == "videos":
755755
video_input = modalities["videos"]
756756
video_embeddings = self._process_video_input(video_input)
757-
multimodal_embeddings += video_embeddings
757+
multimodal_embeddings += tuple(video_embeddings)
758758

759759
return multimodal_embeddings
760760

vllm/model_executor/models/interns1.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -753,12 +753,12 @@ def get_multimodal_embeddings(self, **kwargs: object) -> MultiModalEmbeddings:
753753
for modality in modalities:
754754
if modality == "images":
755755
image_input = modalities["images"]
756-
vision_embeddings = self._process_vision_input(image_input)
757-
multimodal_embeddings += vision_embeddings
756+
image_embeddings = self._process_vision_input(image_input)
757+
multimodal_embeddings += tuple(image_embeddings)
758758
if modality == "videos":
759759
video_input = modalities["videos"]
760760
video_embeddings = self._process_vision_input(video_input)
761-
multimodal_embeddings += video_embeddings
761+
multimodal_embeddings += tuple(video_embeddings)
762762

763763
return multimodal_embeddings
764764

vllm/model_executor/models/internvl.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1358,12 +1358,12 @@ def get_multimodal_embeddings(self, **kwargs: object) -> MultiModalEmbeddings:
13581358
for modality in modalities:
13591359
if modality == "images":
13601360
image_input = modalities["images"]
1361-
vision_embeddings = self._process_vision_input(image_input)
1362-
multimodal_embeddings += vision_embeddings
1361+
image_embeddings = self._process_vision_input(image_input)
1362+
multimodal_embeddings += tuple(image_embeddings)
13631363
if modality == "videos":
13641364
video_input = modalities["videos"]
13651365
video_embeddings = self._process_vision_input(video_input)
1366-
multimodal_embeddings += video_embeddings
1366+
multimodal_embeddings += tuple(video_embeddings)
13671367

13681368
return multimodal_embeddings
13691369

vllm/model_executor/models/keye.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1459,12 +1459,12 @@ def get_multimodal_embeddings(
14591459
for modality in modalities:
14601460
if modality == "images":
14611461
image_input = modalities["images"]
1462-
vision_embeddings = self._process_image_input(image_input)
1463-
multimodal_embeddings += vision_embeddings
1462+
image_embeddings = self._process_image_input(image_input)
1463+
multimodal_embeddings += tuple(image_embeddings)
14641464
if modality == "videos":
14651465
video_input = modalities["videos"]
14661466
video_embeddings = self._process_video_input(video_input)
1467-
multimodal_embeddings += video_embeddings
1467+
multimodal_embeddings += tuple(video_embeddings)
14681468
return multimodal_embeddings
14691469

14701470
def forward(

vllm/model_executor/models/llava_onevision.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -881,8 +881,8 @@ def get_multimodal_embeddings(self, **kwargs: object) -> MultiModalEmbeddings:
881881
for modality in mm_input_by_modality:
882882
multimodal_input = mm_input_by_modality[modality]
883883
if modality == "image":
884-
vision_embeddings = self._process_image_input(multimodal_input)
885-
multimodal_embeddings += tuple(vision_embeddings)
884+
image_embeddings = self._process_image_input(multimodal_input)
885+
multimodal_embeddings += tuple(image_embeddings)
886886
if modality == "video":
887887
video_embeddings = self._process_video_pixels(multimodal_input)
888888
multimodal_embeddings += tuple(video_embeddings)

vllm/model_executor/models/minicpmo.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -762,7 +762,7 @@ def _process_multimodal_inputs(self, modalities: dict):
762762
for modality in modalities:
763763
if modality == "audios":
764764
audio_input = modalities["audios"]
765-
audio_features = self._process_audio_input(audio_input)
766-
multimodal_embeddings += tuple(audio_features)
765+
audio_embeddings = self._process_audio_input(audio_input)
766+
multimodal_embeddings += tuple(audio_embeddings)
767767

768768
return multimodal_embeddings

vllm/model_executor/models/minicpmv.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1129,12 +1129,12 @@ def _process_multimodal_inputs(self, modalities: dict):
11291129
for modality in modalities:
11301130
if modality == "images":
11311131
image_input = modalities["images"]
1132-
image_features = self._process_vision_input(image_input)
1133-
multimodal_embeddings += tuple(image_features)
1132+
image_embeddings = self._process_vision_input(image_input)
1133+
multimodal_embeddings += tuple(image_embeddings)
11341134
if modality == "videos":
11351135
video_input = modalities["videos"]
1136-
video_features = self._process_vision_input(video_input)
1137-
multimodal_embeddings += tuple(video_features)
1136+
video_embeddings = self._process_vision_input(video_input)
1137+
multimodal_embeddings += tuple(video_embeddings)
11381138

11391139
return multimodal_embeddings
11401140

vllm/model_executor/models/nano_nemotron_vl.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1263,12 +1263,12 @@ def get_multimodal_embeddings(self, **kwargs: object) -> MultiModalEmbeddings:
12631263
for modality in modalities:
12641264
if modality == "images":
12651265
image_input = modalities["images"]
1266-
vision_embeddings = self._process_image_input(image_input)
1267-
multimodal_embeddings += vision_embeddings
1266+
image_embeddings = self._process_image_input(image_input)
1267+
multimodal_embeddings += tuple(image_embeddings)
12681268
if modality == "videos":
12691269
video_input = modalities["videos"]
12701270
video_embeddings = self._process_video_input(video_input)
1271-
multimodal_embeddings += video_embeddings
1271+
multimodal_embeddings += tuple(video_embeddings)
12721272

12731273
return multimodal_embeddings
12741274

0 commit comments

Comments
 (0)