@@ -106,6 +106,7 @@ def load_bytes(
         cls,
         data: bytes,
         num_frames: int = -1,
+        fps: int = -1,
         **kwargs,
     ) -> tuple[npt.NDArray, dict[str, Any]]:
         import cv2
@@ -119,14 +120,20 @@ def load_bytes(
         original_fps = cap.get(cv2.CAP_PROP_FPS)
         duration = total_frames_num / original_fps if original_fps > 0 else 0
 
-        # resample video to target num_frames
-        full_read = num_frames == -1 or total_frames_num < num_frames
-        if full_read:
-            num_frames = total_frames_num
-            frame_idx = list(range(0, num_frames))
+        # resample video to target num_frames and fps
+        # - the minimum of the two will be used
+        num_frames_to_sample = total_frames_num
+        if num_frames > 0:
+            num_frames_to_sample = min(num_frames, total_frames_num)
+        if fps > 0:
+            num_frames_to_sample = min(num_frames_to_sample, math.floor(duration * fps))
+        num_frames_to_sample = max(1, num_frames_to_sample)  # at least one sample
+
+        if num_frames_to_sample == total_frames_num:
+            frame_idx = list(range(0, num_frames_to_sample))
         else:
             uniform_sampled_frames = np.linspace(
-                0, total_frames_num - 1, num_frames, dtype=int
+                0, total_frames_num - 1, num_frames_to_sample, dtype=int
             )
             frame_idx = uniform_sampled_frames.tolist()
 
@@ -135,7 +142,7 @@ def load_bytes(
         frames = np.empty((len(frame_idx), height, width, 3), dtype=np.uint8)
 
         i = 0
-        for idx in range(total_frames_num):
+        for idx in range(max(frame_idx) + 1):
             ok = cap.grab()
             if not ok:
                 break
@@ -145,23 +152,23 @@ def load_bytes(
                     frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                     i += 1
 
-        assert i == num_frames, (
-            f"Expected reading {num_frames} frames, "
+        assert i == num_frames_to_sample, (
+            f"Expected reading {num_frames_to_sample} frames, "
             f"but only loaded {i} frames from video."
         )
 
         # Use transformers transformers.video_utils.VideoMetadata format
         # NOTE(Isotr0py): For models like Qwen3-VL/GLM4.5V, this metadata
         # can cause incorrect timestamp calculation without num_frames=-1.
         metadata = {
-            "total_num_frames": num_frames,
-            "fps": num_frames / duration,
+            "total_num_frames": total_frames_num,
+            "fps": original_fps,
             "duration": duration,
             "video_backend": "opencv",
-            "frames_indices": list(range(num_frames)),
+            "frames_indices": list(frame_idx),
             # extra field used to control hf processor's video
             # sampling behavior
-            "do_sample_frames": num_frames == total_frames_num,
+            "do_sample_frames": num_frames_to_sample == total_frames_num,
         }
 
         return frames, metadata
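Aside (not part of the patch): with the metadata change above, the dict now describes the original video rather than the sampled clip, which is what HF processors such as Qwen3-VL/GLM4.5V need for timestamp calculation. A hedged usage sketch, assuming a hypothetical 10 second, 30 fps input held in `video_bytes` and `OpenCVVideoBackend` already in scope; the exact numbers depend on the file:

frames, metadata = OpenCVVideoBackend.load_bytes(video_bytes, num_frames=4)

assert frames.shape[0] == 4  # only the sampled frames are returned
assert metadata["total_num_frames"] == 300  # original frame count, not 4
assert metadata["fps"] == 30.0  # original fps, not num_frames / duration
assert metadata["frames_indices"] == [0, 99, 199, 299]  # np.linspace(0, 299, 4, dtype=int)
assert metadata["do_sample_frames"] is False  # frames were already sampled here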
@@ -250,134 +257,6 @@ def load_bytes(
         return frames, metadata
 
 
-@VIDEO_LOADER_REGISTRY.register("opencv_nemotron_vl_v2")
-class OpenCVNemotronVideoBackend(OpenCVVideoBackend):
-    @classmethod
-    def _get_frame_indices_to_sample(
-        cls,
-        total_frames_num: int,
-        max_num_frames_to_sample: int,
-        fps: int,
-        duration_seconds: float,
-        **kwargs,
-    ) -> list[int]:
-        # Determine target number of samples:
-        max_samples = total_frames_num
-        if max_num_frames_to_sample > 0:  # Hard upper bound
-            max_samples = min(max_num_frames_to_sample, max_samples)
-        if fps > 0:  # If fps is provided, use it to limit the number of samples
-            max_samples = min(max_samples, math.floor(duration_seconds * fps))
-        max_samples = max(1, max_samples)  # to make sure we have at least one sample
-
-        # Uniform coverage of the entire timeline within the cap
-        # Use linspace over [0, total_frames-1]
-        raw = np.linspace(0, total_frames_num - 1, max_samples, endpoint=True)
-        return np.unique(raw.round().astype(int)).tolist()
-
-    @classmethod
-    def _sample_frames_from_video(
-        cls,
-        cap,
-        frame_indices: list[int],
-        allow_missing_frames: bool = False,
-    ) -> tuple[npt.NDArray, list[int]]:
-        import cv2
-
-        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-        frames = np.full((len(frame_indices), height, width, 3), 255, dtype=np.uint8)
-
-        i = 0
-        for idx in range(max(frame_indices) + 1):
-            ok = cap.grab()
-            if not ok:
-                break
-            if idx in frame_indices:
-                ret, frame = cap.retrieve()
-                if ret:
-                    frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                    i += 1
-
-        if not allow_missing_frames and i != len(frame_indices):
-            raise ValueError(
-                f"Expected reading {len(frame_indices)} frames, "
-                f"but only loaded {i} frames from video."
-            )
-
-        return frames[:i], frame_indices[:i]
-
-    @classmethod
-    def load_bytes(
-        cls,
-        data: bytes,
-        num_frames: int = -1,
-        fps: int = -1,
-        **kwargs,
-    ) -> tuple[npt.NDArray, dict[str, Any]]:
-        """
-        Args:
-            num_frames (int): Maximum number of frames to load. The
-                total number of sampled frames will never be larger
-                than this value. Set it to -1 to remove the upper limit.
-
-            fps (int): Desired video sampling rate. The real sampling
-                rate may be lower for a long video when the num_frames
-                upper limit is set to a positive value.
-        """
-        import cv2
-
-        backend = cls().get_cv2_video_api()
-        cap = cv2.VideoCapture(BytesIO(data), backend, [])
-        if not cap.isOpened():
-            raise ValueError("Could not open video stream")
-
-        total_frames_num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-        if total_frames_num == 0:
-            raise ValueError("CAP_PROP_FRAME_COUNT returned 0")
-
-        original_fps = float(cap.get(cv2.CAP_PROP_FPS))
-        if not (original_fps > 0):
-            logger.warning(
-                "CAP_PROP_FPS returned value %f. "
-                "We will use 30 FPS as default fallback.",
-                original_fps,
-            )
-            original_fps = 30.0
-
-        duration = float(total_frames_num / original_fps)
-
-        frame_indices = cls._get_frame_indices_to_sample(
-            total_frames_num, num_frames, fps, duration
-        )
-
-        effective_fps = len(frame_indices) / duration
-        logger.debug(
-            "Video [%d frames](%.2f sec at %.2f fps) sampled "
-            "into frame [%d] indexes at %.2f fps.",
-            total_frames_num,
-            duration,
-            original_fps,
-            len(frame_indices),
-            effective_fps,
-        )
-
-        frames, frame_indices = cls._sample_frames_from_video(
-            cap, frame_indices, allow_missing_frames=True
-        )
-
-        # Use transformers transformers.video_utils.VideoMetadata format
-        metadata = {
-            "total_num_frames": total_frames_num,
-            "fps": original_fps,
-            "duration": duration,
-            "video_backend": "opencv_nemotron_vl_v2",
-            "frames_indices": frame_indices,
-            "do_sample_frames": False,
-        }
-
-        return frames, metadata
-
-
 class VideoMediaIO(MediaIO[npt.NDArray]):
     def __init__(
         self,
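Aside (not part of the patch): the decode loop kept in `OpenCVVideoBackend.load_bytes` above and the removed `_sample_frames_from_video` share the same OpenCV idiom: `cap.grab()` advances the stream without a full decode, `cap.retrieve()` decodes only the frames that are actually kept, and iteration stops after the last sampled index. A standalone sketch of that pattern (the function name is invented):

import cv2
import numpy as np


def grab_sampled_frames(cap: cv2.VideoCapture, frame_idx: list[int]) -> np.ndarray:
    wanted = set(frame_idx)  # O(1) membership checks
    frames = []
    for idx in range(max(frame_idx) + 1):  # no need to read past the last index
        if not cap.grab():  # advance the decoder without decoding the frame
            break  # stream ended earlier than expected
        if idx in wanted:
            ret, frame = cap.retrieve()  # decode only the wanted frames
            if ret:
                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    return np.stack(frames)  # raises if nothing was decoded, acceptable for a sketch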