Skip to content

Commit ee0804e

Browse files
committed
introduced 'fps' to base video backend. removed nemotron backend
Signed-off-by: Natan Bagrov <nbagrov@nvidia.com>
1 parent 09f8c57 commit ee0804e

File tree

1 file changed

+20
-141
lines changed

1 file changed

+20
-141
lines changed

vllm/multimodal/video.py

Lines changed: 20 additions & 141 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ def load_bytes(
106106
cls,
107107
data: bytes,
108108
num_frames: int = -1,
109+
fps: int = -1,
109110
**kwargs,
110111
) -> tuple[npt.NDArray, dict[str, Any]]:
111112
import cv2
@@ -119,14 +120,20 @@ def load_bytes(
119120
original_fps = cap.get(cv2.CAP_PROP_FPS)
120121
duration = total_frames_num / original_fps if original_fps > 0 else 0
121122

122-
# resample video to target num_frames
123-
full_read = num_frames == -1 or total_frames_num < num_frames
124-
if full_read:
125-
num_frames = total_frames_num
126-
frame_idx = list(range(0, num_frames))
123+
# resample video to target num_frames and fps
124+
# - the minimum of the two will be used
125+
num_frames_to_sample = total_frames_num
126+
if num_frames > 0:
127+
num_frames_to_sample = min(num_frames, total_frames_num)
128+
if fps > 0:
129+
num_frames_to_sample = min(num_frames_to_sample, math.floor(duration * fps))
130+
num_frames_to_sample = max(1, num_frames_to_sample) # at least one sample
131+
132+
if num_frames_to_sample == total_frames_num:
133+
frame_idx = list(range(0, num_frames_to_sample))
127134
else:
128135
uniform_sampled_frames = np.linspace(
129-
0, total_frames_num - 1, num_frames, dtype=int
136+
0, total_frames_num - 1, num_frames_to_sample, dtype=int
130137
)
131138
frame_idx = uniform_sampled_frames.tolist()
132139

@@ -135,7 +142,7 @@ def load_bytes(
135142
frames = np.empty((len(frame_idx), height, width, 3), dtype=np.uint8)
136143

137144
i = 0
138-
for idx in range(total_frames_num):
145+
for idx in range(max(frame_idx) + 1):
139146
ok = cap.grab()
140147
if not ok:
141148
break
@@ -145,23 +152,23 @@ def load_bytes(
145152
frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
146153
i += 1
147154

148-
assert i == num_frames, (
149-
f"Expected reading {num_frames} frames, "
155+
assert i == num_frames_to_sample, (
156+
f"Expected reading {num_frames_to_sample} frames, "
150157
f"but only loaded {i} frames from video."
151158
)
152159

153160
# Use the transformers.video_utils.VideoMetadata format
154161
# NOTE(Isotr0py): For models like Qwen3-VL/GLM4.5V, this metadata
155162
# can cause incorrect timestamp calculation without num_frames=-1.
156163
metadata = {
157-
"total_num_frames": num_frames,
158-
"fps": num_frames / duration,
164+
"total_num_frames": total_frames_num,
165+
"fps": original_fps,
159166
"duration": duration,
160167
"video_backend": "opencv",
161-
"frames_indices": list(range(num_frames)),
168+
"frames_indices": list(frame_idx),
162169
# extra field used to control hf processor's video
163170
# sampling behavior
164-
"do_sample_frames": num_frames == total_frames_num,
171+
"do_sample_frames": num_frames_to_sample == total_frames_num,
165172
}
166173

167174
return frames, metadata
@@ -250,134 +257,6 @@ def load_bytes(
250257
return frames, metadata
251258

252259

253-
@VIDEO_LOADER_REGISTRY.register("opencv_nemotron_vl_v2")
254-
class OpenCVNemotronVideoBackend(OpenCVVideoBackend):
255-
@classmethod
256-
def _get_frame_indices_to_sample(
257-
cls,
258-
total_frames_num: int,
259-
max_num_frames_to_sample: int,
260-
fps: int,
261-
duration_seconds: float,
262-
**kwargs,
263-
) -> list[int]:
264-
# Determine target number of samples:
265-
max_samples = total_frames_num
266-
if max_num_frames_to_sample > 0: # Hard upper bound
267-
max_samples = min(max_num_frames_to_sample, max_samples)
268-
if fps > 0: # If fps is provided, use it to limit the number of samples
269-
max_samples = min(max_samples, math.floor(duration_seconds * fps))
270-
max_samples = max(1, max_samples) # to make sure we have at least one sample
271-
272-
# Uniform coverage of the entire timeline within the cap
273-
# Use linspace over [0, total_frames-1]
274-
raw = np.linspace(0, total_frames_num - 1, max_samples, endpoint=True)
275-
return np.unique(raw.round().astype(int)).tolist()
276-
277-
@classmethod
278-
def _sample_frames_from_video(
279-
cls,
280-
cap,
281-
frame_indices: list[int],
282-
allow_missing_frames: bool = False,
283-
) -> tuple[npt.NDArray, list[int]]:
284-
import cv2
285-
286-
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
287-
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
288-
frames = np.full((len(frame_indices), height, width, 3), 255, dtype=np.uint8)
289-
290-
i = 0
291-
for idx in range(max(frame_indices) + 1):
292-
ok = cap.grab()
293-
if not ok:
294-
break
295-
if idx in frame_indices:
296-
ret, frame = cap.retrieve()
297-
if ret:
298-
frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
299-
i += 1
300-
301-
if not allow_missing_frames and i != len(frame_indices):
302-
raise ValueError(
303-
f"Expected reading {len(frame_indices)} frames, "
304-
f"but only loaded {i} frames from video."
305-
)
306-
307-
return frames[:i], frame_indices[:i]
308-
309-
@classmethod
310-
def load_bytes(
311-
cls,
312-
data: bytes,
313-
num_frames: int = -1,
314-
fps: int = -1,
315-
**kwargs,
316-
) -> tuple[npt.NDArray, dict[str, Any]]:
317-
"""
318-
Args:
319-
num_frames (int): Maximum number of frames to load. A
320-
total sampled number of frames will never be larger
321-
than this value. Set it to -1 to remove the upper limit.
322-
323-
fps (int): Desired video sampling rate. The actual sampling
324-
rate may be lower if we encounter a long video and
325-
the num_frames upper limit is set to a positive value.
326-
"""
327-
import cv2
328-
329-
backend = cls().get_cv2_video_api()
330-
cap = cv2.VideoCapture(BytesIO(data), backend, [])
331-
if not cap.isOpened():
332-
raise ValueError("Could not open video stream")
333-
334-
total_frames_num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
335-
if total_frames_num == 0:
336-
raise ValueError("CAP_PROP_FRAME_COUNT returned 0")
337-
338-
original_fps = float(cap.get(cv2.CAP_PROP_FPS))
339-
if not (original_fps > 0):
340-
logger.warning(
341-
"CAP_PROP_FPS returned value %f. "
342-
"We will use 30 FPS as default fallback.",
343-
original_fps,
344-
)
345-
original_fps = 30.0
346-
347-
duration = float(total_frames_num / original_fps)
348-
349-
frame_indices = cls._get_frame_indices_to_sample(
350-
total_frames_num, num_frames, fps, duration
351-
)
352-
353-
effective_fps = len(frame_indices) / duration
354-
logger.debug(
355-
"Video [%d frames](%.2f sec at %.2f fps) sampled "
356-
"into frame [%d] indexes at %.2f fps.",
357-
total_frames_num,
358-
duration,
359-
original_fps,
360-
len(frame_indices),
361-
effective_fps,
362-
)
363-
364-
frames, frame_indices = cls._sample_frames_from_video(
365-
cap, frame_indices, allow_missing_frames=True
366-
)
367-
368-
# Use the transformers.video_utils.VideoMetadata format
369-
metadata = {
370-
"total_num_frames": total_frames_num,
371-
"fps": original_fps,
372-
"duration": duration,
373-
"video_backend": "opencv_nemotron_vl_v2",
374-
"frames_indices": frame_indices,
375-
"do_sample_frames": False,
376-
}
377-
378-
return frames, metadata
379-
380-
381260
class VideoMediaIO(MediaIO[npt.NDArray]):
382261
def __init__(
383262
self,

0 commit comments

Comments
 (0)