Fix PerceptionLM image preprocessing for non-tiled image input. (#40006)

shuminghu · web-flow · commit 3ff2e984d2c3 · 2025-08-12T08:40:22.000Z
* Fix PerceptionLM image preprocessing for non-tiled image input.

* Add test for single tile vanilla image processing.

* Apply ruff format.

* Recover missing test skip.

* Simplify test.

* Minor test name fix.
diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py
@@ -310,7 +310,7 @@ def _preprocess(
             )
             processed_images_grouped[shape] = stacked_images
         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
-
+        processed_images = [p[None] if p.ndim == 3 else p for p in processed_images]  # add tiles dimension if needed
         processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
         return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
 
diff --git a/tests/models/perception_lm/test_processing_perception_lm.py b/tests/models/perception_lm/test_processing_perception_lm.py
@@ -115,6 +115,36 @@ def test_image_token_filling(self):
         )
         image_tokens = (inputs["input_ids"] == image_token_index).sum().item()
         self.assertEqual(expected_image_tokens, image_tokens)
+        self.assertEqual(inputs["pixel_values"].ndim, 5)
+
+    def test_vanilla_image_with_no_tiles_token_filling(self):
+        processor = self.processor_class.from_pretrained(self.tmpdirname)
+        processor.image_processor.vision_input_type = "vanilla"
+        # Important to check with non square image
+        image = torch.randn((1, 3, 450, 500))
+        #  1 tile
+        #  448/patch_size/pooling_ratio = 16 => 16*16 tokens per tile
+        expected_image_tokens = 16 * 16 * 1
+        image_token_index = processor.image_token_id
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image"},
+                    {"type": "text", "text": "What is shown in this image?"},
+                ],
+            },
+        ]
+        inputs = processor(
+            text=[processor.apply_chat_template(messages)],
+            images=[image],
+            return_tensors="pt",
+        )
+        image_tokens = (inputs["input_ids"] == image_token_index).sum().item()
+        self.assertEqual(expected_image_tokens, image_tokens)
+        self.assertEqual(inputs["pixel_values"].ndim, 5)
+        self.assertEqual(inputs["pixel_values"].shape[1], 1)  # 1 tile
 
 
 CHAT_TEMPLATE = (

Original file line number	Diff line number	Diff line change
`@@ -310,7 +310,7 @@ def _preprocess(`
`310`	`310`	`)`
`311`	`311`	`processed_images_grouped[shape] = stacked_images`
`312`	`312`	`processed_images = reorder_images(processed_images_grouped, grouped_images_index)`
`313`		`-`
	`313`	`+ processed_images = [p[None] if p.ndim == 3 else p for p in processed_images] # add tiles dimension if needed`
`314`	`314`	`processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images`
`315`	`315`	`return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)`
`316`	`316`