3 files changed: +50 -1 lines changed
@@ -823,6 +823,14 @@ def get_image_processor(
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None, "video": None}

+    def get_max_tokens_per_item(
+            self, seq_len: int,
+            mm_counts: Mapping[str, int]) -> Optional[Mapping[str, int]]:
+
+        max_image_tokens = self.get_max_image_tokens()
+        max_video_tokens = self.get_max_video_tokens(seq_len, mm_counts)
+        return {"image": max_image_tokens, "video": max_video_tokens}
+
    def _get_vision_info(
        self,
        *,
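For illustration only, here is a rough, self-contained sketch of the shape of mapping an override like the one above returns; the `ExampleProcessingInfo` class, its helper implementations, and the concrete numbers are assumptions for this sketch, not taken from this PR.

```python
from typing import Mapping, Optional


class ExampleProcessingInfo:
    """Toy stand-in for a processing-info class with pre-computed limits."""

    def get_max_image_tokens(self) -> int:
        # Assumed fixed worst case for a single image.
        return 1024

    def get_max_video_tokens(self, seq_len: int,
                             mm_counts: Mapping[str, int]) -> int:
        # Assumed policy: a video may use whatever budget remains after
        # reserving room for the images the prompt is allowed to contain.
        reserved = self.get_max_image_tokens() * mm_counts.get("image", 0)
        return max(seq_len - reserved, 0)

    def get_max_tokens_per_item(
            self, seq_len: int,
            mm_counts: Mapping[str, int]) -> Optional[Mapping[str, int]]:
        return {
            "image": self.get_max_image_tokens(),
            "video": self.get_max_video_tokens(seq_len, mm_counts),
        }


info = ExampleProcessingInfo()
print(info.get_max_tokens_per_item(seq_len=8192, mm_counts={"image": 2}))
# -> {'image': 1024, 'video': 6144}
```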
@@ -1100,6 +1100,27 @@ def get_allowed_mm_limits(self) -> Mapping[str, int]:

        return allowed_limits

+    def get_max_tokens_per_item(
+            self, seq_len: int,
+            mm_counts: Optional[Mapping[str,
+                                        int]]) -> Optional[Mapping[str, int]]:
+        """Return the maximum number of tokens per item for each modality.
+
+        By default, returns `None`. When `None` is returned, vLLM will
+        generate dummy inputs (images/videos) at maximum possible sizes and
+        process them to determine the maximum token count per modality.
+
+        This approach works but can be very slow for certain models (e.g.,
+        Qwen2.5-VL), leading to very long startup times. For better
+        performance, each model can override this method to return
+        pre-computed maximum token counts, avoiding the need for dummy input
+        generation and processing.
+
+        NOTE: The maximum number of tokens per item of each modality returned
+        from this function should respect the model's maximum sequence length
+        and the maximum number of items of each modality allowed, and should
+        agree with dummy inputs (images/videos) at maximum possible sizes.
+        """
+        return None
+


_I = TypeVar("_I", bound=BaseProcessingInfo)

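The docstring above spells out the override contract. As a minimal sketch of that contract under assumed names (the `BaseProcessingInfoSketch` and `FixedPatchProcessingInfo` classes and the 576-token figure are hypothetical, not vLLM APIs), a model whose items always occupy a fixed number of tokens could skip the dummy-input path like this:

```python
from typing import Mapping, Optional


class BaseProcessingInfoSketch:
    """Stand-in for the base class above (illustrative only)."""

    def get_max_tokens_per_item(
        self,
        seq_len: int,
        mm_counts: Optional[Mapping[str, int]],
    ) -> Optional[Mapping[str, int]]:
        # Default: no pre-computed counts, so the caller falls back to
        # building and processing dummy inputs at maximum sizes.
        return None


class FixedPatchProcessingInfo(BaseProcessingInfoSketch):
    """Hypothetical model whose images always produce a fixed token count."""

    MAX_IMAGE_TOKENS = 576  # e.g. a 384x384 image in 16x16 patches

    def get_max_tokens_per_item(
        self,
        seq_len: int,
        mm_counts: Optional[Mapping[str, int]],
    ) -> Optional[Mapping[str, int]]:
        # Never report more tokens per item than the sequence can hold.
        return {"image": min(self.MAX_IMAGE_TOKENS, seq_len)}


info = FixedPatchProcessingInfo()
print(info.get_max_tokens_per_item(seq_len=4096, mm_counts={"image": 2}))
# -> {'image': 576}
```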
@@ -253,6 +253,26 @@ def get_mm_max_tokens(
        seq_len: int,
        mm_counts: Optional[Mapping[str, int]] = None,
    ) -> Mapping[str, int]:
-        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
+        max_tokens_per_item = self.processing_info.get_max_tokens_per_item(
+            seq_len=seq_len, mm_counts=mm_counts)
+        if max_tokens_per_item is not None:
+            if mm_counts is None:
+                total_mm_tokens = sum(max_tokens_per_item.values())
+            else:
+                total_mm_tokens = sum(max_tokens_per_item[k] * mm_counts[k]
+                                      for k in max_tokens_per_item.keys()
+                                      & mm_counts.keys())
+            if total_mm_tokens > seq_len:
+                logger.warning_once(
+                    "The sequence length (%d) is smaller than the pre-defined"
+                    " worst-case total number of multimodal tokens (%d). "
+                    "This may cause certain multi-modal inputs to fail during "
+                    "inference. To avoid this, you should increase "
+                    "`max_model_len` or reduce `mm_counts`.",
+                    seq_len,
+                    total_mm_tokens,
+                )
+            return max_tokens_per_item

+        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
        return self._get_mm_num_tokens(mm_inputs)
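To show the added control flow in isolation, here is a minimal sketch of the fast path plus fallback; `estimate_mm_max_tokens` and the stubbed fallback are assumptions for the example, not vLLM APIs.

```python
import logging
from typing import Callable, Mapping, Optional

logger = logging.getLogger(__name__)


def estimate_mm_max_tokens(
    seq_len: int,
    mm_counts: Optional[Mapping[str, int]],
    max_tokens_per_item: Optional[Mapping[str, int]],
    fallback: Callable[[], Mapping[str, int]],
) -> Mapping[str, int]:
    """Prefer pre-computed per-item counts; otherwise use the slow fallback."""
    if max_tokens_per_item is None:
        # Slow path: build dummy inputs at maximum sizes and count tokens.
        return fallback()

    if mm_counts is None:
        total = sum(max_tokens_per_item.values())
    else:
        # Only modalities present in both mappings contribute to the total.
        total = sum(max_tokens_per_item[k] * mm_counts[k]
                    for k in max_tokens_per_item.keys() & mm_counts.keys())

    if total > seq_len:
        logger.warning(
            "Worst-case multimodal tokens (%d) exceed the sequence length "
            "(%d); some multimodal prompts may fail.", total, seq_len)
    return max_tokens_per_item


print(estimate_mm_max_tokens(
    seq_len=1024,
    mm_counts={"image": 4, "video": 1},
    max_tokens_per_item={"image": 576, "video": 2048},
    fallback=lambda: {"image": 0, "video": 0},
))
```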