66from typing import Iterable , Optional , Set , Tuple , Union
77
88import torch
9- from PIL import Image
109from torch import nn
1110from transformers import SiglipVisionConfig
1211
2019from vllm .model_executor .layers .vocab_parallel_embedding import (
2120 VocabParallelEmbedding )
2221from vllm .model_executor .model_loader .weight_utils import default_weight_loader
23- from vllm .multimodal .utils import consecutive_placeholder_ranges
24- from vllm .sequence import SequenceData
2522
2623from .vision import VisionEncoderInfo , resolve_visual_encoder_outputs
2724
2825
def get_siglip_patch_grid_length(*, image_size: int, patch_size: int) -> int:
    """Return the number of patches along one side of the square image grid.

    Floor division is deliberate: position embeddings are interpolated, so
    ``image_size`` need not be an exact multiple of ``patch_size``.
    """
    grid_length, _ = divmod(image_size, patch_size)
    return grid_length
33-
34-
def get_siglip_num_patches(*, image_size: int, patch_size: int) -> int:
    """Return the total patch count for a square SigLIP input image."""
    side = get_siglip_patch_grid_length(image_size=image_size,
                                        patch_size=patch_size)
    return side * side
39-
40-
def get_siglip_image_feature_size(hf_config: SiglipVisionConfig) -> int:
    """Return the number of vision-feature tokens produced per image."""
    return get_siglip_num_patches(image_size=hf_config.image_size,
                                  patch_size=hf_config.patch_size)
44-
45-
def get_max_siglip_image_tokens(hf_config: SiglipVisionConfig) -> int:
    """Return the maximum image-token budget.

    SigLIP emits a fixed number of tokens per image, so the maximum equals
    the per-image feature size.
    """
    feature_size = get_siglip_image_feature_size(hf_config)
    return feature_size
48-
49-
def dummy_seq_data_for_siglip(
    hf_config: SiglipVisionConfig,
    seq_len: int,
    num_images: int,
    *,
    image_token_id: int,
    image_feature_size_override: Optional[int] = None,
    mm_key: str = "image",
):
    """Build dummy sequence data for profiling a SigLIP-backed model.

    Returns a ``(SequenceData, placeholder_ranges)`` pair: the prompt is
    ``num_images * feature_size`` image-token ids followed by zero-padding
    up to ``seq_len``, and the ranges mark where each image's placeholder
    tokens sit.
    """
    feature_size = (get_siglip_image_feature_size(hf_config)
                    if image_feature_size_override is None else
                    image_feature_size_override)

    total_image_tokens = feature_size * num_images
    seq_data = SequenceData.from_prompt_token_counts(
        (image_token_id, total_image_tokens),
        (0, seq_len - total_image_tokens),
    )
    ranges = consecutive_placeholder_ranges(num_items=num_images,
                                            item_size=feature_size)
    return seq_data, {mm_key: ranges}
72-
73-
def dummy_image_for_siglip(
    hf_config: SiglipVisionConfig,
    num_images: int,
    *,
    image_width_override: Optional[int] = None,
    image_height_override: Optional[int] = None,
):
    """Create black placeholder image(s) sized for profiling.

    Defaults to the encoder's square ``image_size``; either dimension can
    be overridden independently. Returns a single image when
    ``num_images == 1``, otherwise a list of identical images.
    """
    default_size = hf_config.image_size
    width = default_size if image_width_override is None else image_width_override
    height = default_size if image_height_override is None else image_height_override

    image = Image.new("RGB", (width, height), color=0)
    return {"image": image if num_images == 1 else [image] * num_images}
89-
90-
class SiglipEncoderInfo(VisionEncoderInfo[SiglipVisionConfig]):
    """Geometry queries for a SigLIP vision encoder.

    All values are derived from ``vision_config.image_size`` and
    ``vision_config.patch_size``.
    """

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
    ) -> int:
        # The width/height arguments are intentionally ignored: SigLIP's
        # token count is fixed by the configured (square) image size,
        # independent of the original image dimensions.
        return self.get_patch_grid_length()**2

    def get_max_image_tokens(self) -> int:
        # Fixed token count per image, so the max equals the regular count.
        return self.get_patch_grid_length()**2

    def get_image_size(self) -> int:
        return self.vision_config.image_size

    def get_patch_size(self) -> int:
        return self.vision_config.patch_size

    def get_patch_grid_length(self) -> int:
        # Floor division: position-embedding interpolation means the image
        # size need not be divisible by the patch size.
        image_size, patch_size = self.get_image_size(), self.get_patch_size()
        return image_size // patch_size
11548
11649
11750# Adapted from https://github.com/huggingface/transformers/blob/v4.43.3/src/transformers/models/siglip/modeling_siglip.py#L249 # noqa
0 commit comments