Commit 2a91bc1

NickLucche authored and zRzRzRzRzRzRzR committed
[Model] Add SupportsMultiModal.get_language_model interface (vllm-project#16007)
Signed-off-by: NickLucche <nlucches@redhat.com>
Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com>
1 parent c8e2b50 commit 2a91bc1

33 files changed (+116 −0)

docs/source/contributing/model/multimodal.md

Lines changed: 11 additions & 0 deletions

````diff
@@ -79,6 +79,17 @@ Further update the model as follows:
         return inputs_embeds
   ```
 
+- Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model` getter to provide stable access to the underlying language model.
+
+  ```python
+  class YourModelForImage2Seq(nn.Module):
+      ...
+
+      def get_language_model(self) -> torch.nn.Module:
+          # Change `language_model` according to your implementation.
+          return self.language_model
+  ```
+
 - Once the above steps are done, update the model class with the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
 
   ```diff
````
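To illustrate what the new getter buys downstream code, here is a minimal, self-contained sketch; `ToyMultiModalModel` and `count_language_model_params` are hypothetical stand-ins for illustration, not vLLM APIs:

```python
import torch.nn as nn


class ToyMultiModalModel(nn.Module):
    """Hypothetical model exposing the getter added in this commit."""

    def __init__(self) -> None:
        super().__init__()
        self.vision_tower = nn.Linear(16, 32)   # stand-in vision encoder
        self.language_model = nn.Linear(32, 8)  # stand-in language decoder

    def get_language_model(self) -> nn.Module:
        # Stable accessor: callers need not know the attribute name.
        return self.language_model


def count_language_model_params(model: ToyMultiModalModel) -> int:
    """Hypothetical caller that relies only on the getter."""
    lm = model.get_language_model()
    return sum(p.numel() for p in lm.parameters())


print(count_language_model_params(ToyMultiModalModel()))  # 32 * 8 + 8 = 264
```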

vllm/model_executor/models/aria.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -605,6 +605,9 @@ def _process_image_input(
 
         return self.multi_modal_projector(image_outputs, image_attn_mask)
 
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
         image_input = self._parse_and_validate_image_input(**kwargs)
```

vllm/model_executor/models/aya_vision.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -424,6 +424,9 @@ def _parse_and_validate_image_input(
             num_patches=num_patches,
         )
 
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
         image_input = self._parse_and_validate_image_input(**kwargs)
```

vllm/model_executor/models/blip2.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -627,6 +627,9 @@ def _process_image_input(self,
 
         return self.language_projection(query_output)
 
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
         image_input = self._parse_and_validate_image_input(**kwargs)
```

vllm/model_executor/models/chameleon.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -988,6 +988,9 @@ def _parse_and_validate_image_input(
             data=self._validate_pixel_values(pixel_values),
         )
 
+    def get_language_model(self) -> torch.nn.Module:
+        return self.model
+
     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
         image_input = self._parse_and_validate_image_input(**kwargs)
```

vllm/model_executor/models/deepseek_vl2.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -604,6 +604,9 @@ def _process_image_input(
         return self._pixel_values_to_embedding(
             pixel_values=pixel_values, images_spatial_crop=images_spatial_crop)
 
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
         image_input = self._parse_and_validate_image_input(**kwargs)
```

vllm/model_executor/models/florence2.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -1050,6 +1050,9 @@ def _process_image_input(
         pixel_values = image_input["data"]
         return self._encode_image(pixel_values)
 
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
         image_input = self._parse_and_validate_image_input(**kwargs)
```

vllm/model_executor/models/fuyu.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -341,6 +341,9 @@ def _process_image_input(
 
         return vision_embeddings_flat.split(patches_per_image, dim=0)
 
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
         image_input = self._parse_and_validate_image_input(**kwargs)
```

vllm/model_executor/models/gemma3_mm.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -591,6 +591,9 @@ def _process_image_input(
             e.flatten(0, 1) for e in image_embeds.split(num_patches.tolist())
         ]
 
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
         image_input = self._parse_and_validate_image_input(**kwargs)
```

vllm/model_executor/models/glm4v.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -596,6 +596,9 @@ def _process_image_input(
 
         return self.transformer.vision(pixel_values)
 
+    def get_language_model(self) -> torch.nn.Module:
+        return self.transformer
+
     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
         image_input = self._parse_and_validate_image_input(**kwargs)
```
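Note that the attribute backing the getter varies across models: most files above return `self.language_model`, chameleon.py returns `self.model`, and glm4v.py returns `self.transformer`. Hiding that variation is the point of the uniform getter. A minimal sketch of attribute-agnostic calling code follows; the two classes are hypothetical stand-ins, not the real vLLM model classes:

```python
import torch.nn as nn


class ChameleonStyle(nn.Module):
    """Stand-in for a model that stores its decoder as `model`."""

    def __init__(self) -> None:
        super().__init__()
        self.model = nn.Identity()

    def get_language_model(self) -> nn.Module:
        return self.model


class GLM4VStyle(nn.Module):
    """Stand-in for a model that stores its decoder as `transformer`."""

    def __init__(self) -> None:
        super().__init__()
        self.transformer = nn.Identity()

    def get_language_model(self) -> nn.Module:
        return self.transformer


# Callers use the getter rather than guessing attribute names.
for m in (ChameleonStyle(), GLM4VStyle()):
    assert isinstance(m.get_language_model(), nn.Module)
```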
