
Commit 3ce5629

zucchini-nlp authored and Cyrilvallez committed
[Glm4.5V] fix vLLM support (#40696)
* fix

* add a test case
1 parent 26a7e6d commit 3ce5629

4 files changed: +23 -6 lines changed


src/transformers/models/glm4v/image_processing_glm4v.py

Lines changed: 1 addition & 1 deletion
@@ -453,7 +453,7 @@ def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None):
         """
         patch_size = images_kwargs.get("patch_size", self.patch_size)
         merge_size = images_kwargs.get("merge_size", self.merge_size)
-        size = images_kwargs.get("size", self.size)
+        size = images_kwargs.get("size", {"shortest_edge": 112 * 112, "longest_edge": 28 * 28 * 15000})
 
         factor = patch_size * merge_size
         resized_height, resized_width = smart_resize(
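
The only functional change here is the fallback for `size`: when the caller passes `images_kwargs` without a `"size"` entry, the method now falls back to an explicit pixel budget instead of `self.size`. A minimal sketch of exercising that path, assuming `Glm4vImageProcessor` can be default-constructed (in practice these values would come from the checkpoint config):

from transformers import Glm4vImageProcessor

processor = Glm4vImageProcessor()  # assumption: built-in defaults stand in for a real checkpoint config
# No "size" in images_kwargs, so the new fallback
# {"shortest_edge": 112 * 112, "longest_edge": 28 * 28 * 15000} applies.
num_patches = processor.get_number_of_image_patches(height=480, width=640, images_kwargs={})
print(num_patches)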

src/transformers/video_processing_utils.py

Lines changed: 4 additions & 0 deletions
@@ -305,10 +305,14 @@ def _decode_and_sample_videos(
         # Only sample frames if an array video is passed, otherwise first decode -> then sample
         if is_valid_video(videos[0]) and do_sample_frames:
             sampled_videos = []
+            sampled_metadata = []
             for video, metadata in zip(videos, video_metadata):
                 indices = sample_indices_fn(metadata=metadata)
+                metadata.frames_indices = indices
                 sampled_videos.append(video[indices])
+                sampled_metadata.append(metadata)
             videos = sampled_videos
+            video_metadata = sampled_metadata
         elif not is_valid_video(videos[0]):
             if isinstance(videos[0], list):
                 # Videos sometimes are passed as a list of image URLs, especially through templates
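
The added lines keep the per-video metadata in sync with the frames that were actually kept; in particular, recording `frames_indices` is what makes `timestamps` computable afterwards. A rough standalone sketch of the same bookkeeping, using a dummy NumPy video and a hand-picked stride in place of `sample_indices_fn`:

import numpy as np
from transformers.video_utils import VideoMetadata

video = np.zeros((8, 3, 32, 32), dtype=np.uint8)  # 8 dummy frames
metadata = VideoMetadata(total_num_frames=8, fps=4)

indices = list(range(0, 8, 2))     # stand-in for what sample_indices_fn would return
metadata.frames_indices = indices  # mirrors the new assignment in the hunk above
sampled = video[indices]

print(sampled.shape)        # (4, 3, 32, 32)
print(metadata.timestamps)  # [0.0, 0.5, 1.0, 1.5]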

src/transformers/video_utils.py

Lines changed: 11 additions & 5 deletions
@@ -15,9 +15,9 @@
 
 import os
 import warnings
-from collections.abc import Iterable
+from collections.abc import Iterable, Mapping
 from contextlib import redirect_stdout
-from dataclasses import dataclass
+from dataclasses import dataclass, fields
 from io import BytesIO
 from typing import Callable, NewType, Optional, Union
 from urllib.parse import urlparse
@@ -78,7 +78,7 @@
 
 
 @dataclass
-class VideoMetadata:
+class VideoMetadata(Mapping):
     total_num_frames: int
     fps: float = None
     width: int = None
@@ -87,6 +87,12 @@ class VideoMetadata:
     video_backend: str = None
     frames_indices: list[int] = None
 
+    def __iter__(self):
+        return (f.name for f in fields(self))
+
+    def __len__(self):
+        return len(fields(self))
+
     def __getitem__(self, item):
         return getattr(self, item)
 
@@ -96,8 +102,8 @@ def __setitem__(self, key, value):
     @property
     def timestamps(self) -> float:
         "Timestamps of the sampled frames in seconds."
-        if self.fps is None:
-            raise ValueError("Cannot infer video `timestamps` when `fps` is None.")
+        if self.fps is None or self.frames_indices is None:
+            raise ValueError("Cannot infer video `timestamps` when `fps` or `frames_indices` is None.")
         return [frame_idx / self.fps for frame_idx in self.frames_indices]
 
     def update(self, dictionary):
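
Subclassing `Mapping` lets a `VideoMetadata` instance be consumed anywhere a plain dict is expected (dict conversion, `**` unpacking, iteration over keys), which is presumably what the vLLM integration relies on. A quick sketch of what the two new dunder methods enable, with made-up values:

from transformers.video_utils import VideoMetadata

meta = VideoMetadata(total_num_frames=8, fps=4, frames_indices=[0, 2, 4, 6])

print(list(meta))       # field names, via the new __iter__
print(len(meta))        # number of fields, via the new __len__
print(dict(meta))       # the Mapping protocol makes dict(meta) and **meta work
print(meta["fps"])      # __getitem__ existed before this change
print(meta.timestamps)  # [0.0, 0.5, 1.0, 1.5], now guarded against missing frames_indices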

tests/test_video_processing_common.py

Lines changed: 7 additions & 0 deletions
@@ -342,6 +342,13 @@ def test_call_sample_frames(self):
         self.assertEqual(encoded_videos.shape[1], 6)
         self.assertEqual(encoded_videos_batched.shape[1], 6)
 
+        # The same as above but uses a `VideoMetadata` object in the input
+        metadata = [[VideoMetadata(duration=2.0, total_num_frames=8, fps=4)]]
+        batched_metadata = metadata * len(video_inputs)
+        encoded_videos = video_processing(video_inputs[0], return_tensors="pt", fps=3, video_metadata=metadata)[
+            self.input_name
+        ]
+
         # We should raise error when asked to sample more frames than there are in input video
         with self.assertRaises(ValueError):
             encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=10)[self.input_name]
