3333import torch .nn .functional as F
3434from transformers import BatchFeature
3535from transformers .models .qwen2_vl import Qwen2VLImageProcessorFast
36- from transformers .models .qwen2_vl .image_processing_qwen2_vl import smart_resize
36+ from transformers .models .qwen2_vl .image_processing_qwen2_vl import (
37+ smart_resize as image_smart_resize )
3738from transformers .models .qwen3_vl import (Qwen3VLProcessor ,
3839 Qwen3VLVideoProcessor )
3940from transformers .models .qwen3_vl .configuration_qwen3_vl import (
4041 Qwen3VLConfig , Qwen3VLVisionConfig )
42+ from transformers .models .qwen3_vl .video_processing_qwen3_vl import (
43+ smart_resize as video_smart_resize )
4144from transformers .video_utils import VideoMetadata
4245
4346from vllm .attention .layer import check_upstream_fa_availability
8487
logger = init_logger(__name__)

# Official recommended max pixels is 24576 * 32 * 32
# NOTE(review): consumed by get_num_frames_with_most_features below as the
# per-video frame cap when profiling for the worst-case video input.
_MAX_FRAMES_PER_VIDEO = 24576
8793
8894class Qwen3_VisionPatchEmbed (nn .Module ):
8995
@@ -592,24 +598,39 @@ def _get_vision_info(
592598 image_height : int ,
593599 num_frames : int = 2 ,
594600 do_resize : bool = True ,
595- image_processor : Optional [Qwen2VLImageProcessorFast ],
601+ image_processor : Optional [Union [Qwen2VLImageProcessorFast ,
602+ Qwen3VLVideoProcessor ]],
596603 ) -> tuple [ImageSize , int ]:
597- if image_processor is None :
604+ if image_processor is None and num_frames > 1 :
605+ image_processor = self .get_video_processor ()
606+ elif image_processor is None :
598607 image_processor = self .get_image_processor ()
599608
609+ is_video = isinstance (image_processor , Qwen3VLVideoProcessor )
610+
600611 hf_config = self .get_hf_config ()
601612 vision_config = hf_config .vision_config
602613 patch_size = vision_config .patch_size
603614 merge_size = vision_config .spatial_merge_size
604615 temporal_patch_size = vision_config .temporal_patch_size
605616
606617 if do_resize :
618+ if is_video :
619+ smart_resize = video_smart_resize
620+ extra_kwargs = {
621+ "num_frames" : num_frames ,
622+ "temporal_factor" : temporal_patch_size
623+ }
624+ else :
625+ smart_resize = image_smart_resize
626+ extra_kwargs = {}
607627 resized_height , resized_width = smart_resize (
608628 height = image_height ,
609629 width = image_width ,
610630 factor = patch_size * merge_size ,
611631 min_pixels = image_processor .size ["shortest_edge" ],
612632 max_pixels = image_processor .size ["longest_edge" ],
633+ ** extra_kwargs ,
613634 )
614635 preprocessed_size = ImageSize (width = resized_width ,
615636 height = resized_height )
@@ -628,6 +649,39 @@ def _get_vision_info(
628649
629650 return preprocessed_size , num_vision_tokens
630651
def _get_max_video_frames(self,
                          max_tokens: int,
                          start_num_frames: int = 2) -> int:
    # Thin override that forwards to the base implementation.
    # NOTE(review): presumably exists only to change the default
    # start_num_frames to 2 — confirm against the base class default.
    frame_limit = super()._get_max_video_frames(
        max_tokens, start_num_frames=start_num_frames)
    return frame_limit
657+
def get_num_frames_with_most_features(
    self,
    seq_len: int,
    mm_counts: Mapping[str, int],
) -> int:
    # Delegate to the base implementation, capping the search at the
    # module-level per-video frame limit.
    frame_cap = _MAX_FRAMES_PER_VIDEO
    return super().get_num_frames_with_most_features(
        seq_len, mm_counts, max_frames_per_video=frame_cap)
665+
def get_max_video_tokens(
    self,
    seq_len: int,
    mm_counts: Mapping[str, int],
) -> int:
    """Upper bound on prompt tokens contributed by the largest video."""
    # NOTE: By default in Qwen3-VL, one video token is converted to
    # "<{timestamp} seconds>" (on average 9.5 tokens) + vision_start_token + video_token + vision_end_token # noqa: E501
    # i.e. roughly 12.5 formatted text tokens per video soft token.
    tokens_per_soft_token = 12.5

    max_frames = self.get_num_frames_with_most_features(seq_len, mm_counts)
    width, height = self.get_image_size_with_most_features()
    soft_tokens = self.get_num_video_tokens(
        image_width=width,
        image_height=height,
        num_frames=max_frames,
        image_processor=None,
    )
    return int(soft_tokens * tokens_per_soft_token)
684+
631685 def _calculate_timestamps (self , indices : list [int ] | torch .Tensor ,
632686 video_fps : float , merge_size : int ):
633687 if not isinstance (indices , list ):
def get_dummy_mm_data(
    self,
    seq_len: int,
    mm_counts: Mapping[str, int],
):
    """Build dummy image/video inputs used for memory profiling.

    NOTE(review): the signature and the num_images/num_videos lines sit
    above the visible diff hunk and are reconstructed from the standard
    vLLM dummy-data pattern — verify against the full file.
    """
    num_images = mm_counts.get("image", 0)
    num_videos = mm_counts.get("video", 0)

    target_width, target_height = (
        self.info.get_image_size_with_most_features())
    target_num_frames = self.info.get_num_frames_with_most_features(
        seq_len, mm_counts)
    # The dummy video is sized with the *video* processor so that
    # _get_vision_info applies the video smart_resize (which also takes
    # num_frames/temporal_factor into account), rather than reusing the
    # raw max-feature image size.
    target_video_size, _ = self.info._get_vision_info(
        image_width=target_width,
        image_height=target_height,
        num_frames=target_num_frames,
        image_processor=self.info.get_video_processor(),
    )
    return {
        "image":
        self._get_dummy_images(width=target_width,
                               height=target_height,
                               num_images=num_images),
        "video":
        self._get_dummy_videos(
            width=target_video_size.width,
            height=target_video_size.height,
            num_frames=target_num_frames,
            num_videos=num_videos,
        ),
    }
0 commit comments