@@ -16,7 +16,7 @@ Further update the model as follows:
     ...
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<image>"
@@ -45,14 +45,14 @@ Further update the model as follows:
     ...
 
     def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:
-
         assert self.vision_encoder is not None
         image_features = self.vision_encoder(image_input)
         return self.multi_modal_projector(image_features)
 
     def get_multimodal_embeddings(
-            self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
-
+        self,
+        **kwargs: object,
+    ) -> MultiModalEmbeddings | None:
         # Validate the multimodal input keyword arguments
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
@@ -110,7 +110,7 @@ to return the maximum number of input items for each modality supported by the m
 For example, if the model supports any number of images but only one video per prompt:
 
 ```python
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None, "video": 1}
 ```
 
@@ -258,7 +258,7 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
@@ -421,8 +421,10 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
 ```python
     def get_image_size_with_most_features(self) -> ImageSize:
         image_processor = self.get_image_processor()
-        return ImageSize(width=image_processor.size["width"],
-                         height=image_processor.size["height"])
+        return ImageSize(
+            width=image_processor.size["width"],
+            height=image_processor.size["height"],
+        )
 ```
 
 Fuyu does not expect image placeholders in the inputs to HF processor, so
@@ -452,10 +454,12 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
 
         return {
             "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images,
-                                   overrides=image_overrides)
+            self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            )
         }
 ```
 
@@ -744,8 +748,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
                 image_width=image_size.width,
                 image_height=image_size.height,
             )
-            image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
-                            [_NEWLINE_TOKEN_ID]) * nrows
+            image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
 
             return PromptUpdateDetails.select_token_id(
                 image_tokens + [bos_token_id],
@@ -781,8 +784,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
                 image_width=image_size.width,
                 image_height=image_size.height,
             )
-            image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
-                            [_NEWLINE_TOKEN_ID]) * nrows
+            image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
 
             return PromptUpdateDetails.select_token_id(
                 image_tokens + [bos_token_id],
@@ -810,9 +812,11 @@ to register them to the multi-modal registry:
 from vllm.model_executor.models.interfaces import SupportsMultiModal
 + from vllm.multimodal import MULTIMODAL_REGISTRY
 
-+ @MULTIMODAL_REGISTRY.register_processor(YourMultiModalProcessor,
-+                                         info=YourProcessingInfo,
-+                                         dummy_inputs=YourDummyInputsBuilder)
++ @MULTIMODAL_REGISTRY.register_processor(
++     YourMultiModalProcessor,
++     info=YourProcessingInfo,
++     dummy_inputs=YourDummyInputsBuilder,
++ )
 class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
 ```
 
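Aside from the call-site reflows, the substantive change throughout this diff is replacing `typing.Optional[X]` with the PEP 604 spelling `X | None`. The two forms are interchangeable at runtime on Python 3.10+; below is a minimal, self-contained sanity check (illustrative only — it uses a module-level toy version of the updated method, not the real vLLM classmethod):

```python
from typing import Optional

# PEP 604 unions compare equal to their typing-module equivalents,
# so `-> str | None` and `-> Optional[str]` annotate the same type
# (Python 3.10+).
assert (str | None) == Optional[str]


def get_placeholder_str(modality: str) -> str | None:
    # Toy stand-in for the classmethod updated in the first hunk.
    if modality.startswith("image"):
        return "<image>"
    return None


assert get_placeholder_str("image") == "<image>"
assert get_placeholder_str("video") is None
```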