Commit 631d535

Merge pull request huggingface#2 from huggingface/add-image-processor

Add image processor

2 parents: 38481b4 + 3d0cbdb

File tree

4 files changed: +151 −23 lines

- src/transformers/models/auto/image_processing_auto.py
- src/transformers/models/dinov3_vit/__init__.py
- src/transformers/models/dinov3_vit/convert_dinov3_vit_to_hf.py
- src/transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py

src/transformers/models/auto/image_processing_auto.py (1 addition, 0 deletions)

@@ -87,6 +87,7 @@
         ("detr", ("DetrImageProcessor", "DetrImageProcessorFast")),
         ("dinat", ("ViTImageProcessor", "ViTImageProcessorFast")),
         ("dinov2", ("BitImageProcessor", "BitImageProcessorFast")),
+        ("dinov3_vit", (None, "DINOv3ViTImageProcessorFast")),
         ("donut-swin", ("DonutImageProcessor", "DonutImageProcessorFast")),
         ("dpt", ("DPTImageProcessor", "DPTImageProcessorFast")),
         ("efficientformer", ("EfficientFormerImageProcessor",)),

src/transformers/models/dinov3_vit/__init__.py (1 addition, 0 deletions)

@@ -20,6 +20,7 @@
 if TYPE_CHECKING:
     from .configuration_dinov3_vit import *
     from .modeling_dinov3_vit import *
+    from .image_processing_dinov3_vit_fast import *
 else:
     import sys

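Because the new module is imported only under TYPE_CHECKING, with the else branch deferring to the library's lazy-module machinery, the class becomes importable from the top-level package at runtime without an eager torchvision import. A quick check, as a sketch assuming torch and torchvision are installed:

from transformers import DINOv3ViTImageProcessorFast

# Class-level defaults declared in the new processor file below:
print(DINOv3ViTImageProcessorFast.size)  # {'height': 224, 'width': 224}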
src/transformers/models/dinov3_vit/convert_dinov3_vit_to_hf.py (47 additions, 23 deletions)

@@ -3,16 +3,16 @@
 URL: https://github.com/facebookresearch/dinov3/tree/main
 """
 
+import os
 import argparse
-from typing import Optional
 import torch
 
 import random
 import numpy as np
 from torchvision import transforms
 import requests
 from PIL import Image
-from transformers import DINOv3ViTConfig, DINOv3ViTModel
+from transformers import DINOv3ViTConfig, DINOv3ViTModel, DINOv3ViTImageProcessorFast
 from huggingface_hub import hf_hub_download
 
 HUB_MODELS = {
@@ -34,7 +34,7 @@
 }
 
 
-def get_dinov3_config(model_name: str) -> Optional[DINOv3ViTConfig]:
+def get_dinov3_config(model_name: str) -> DINOv3ViTConfig:
     # size of the architecture
     if model_name == "vits":
         return DINOv3ViTConfig(
@@ -149,7 +149,6 @@ def get_dinov3_config(model_name: str) -> Optional[DINOv3ViTConfig]:
     else:
         raise ValueError("Model not supported")
 
-
 def convert_dinov3_vit_to_hf_vit(original_dinov3_state_dict, config: DINOv3ViTConfig):
     embed_dim = config.hidden_size
     hf_dinov3_state_dict = {}
@@ -204,7 +203,7 @@ def prepare_img():
     return image
 
 
-def make_transform(resize_size: int = 224):
+def get_transform(resize_size: int = 224):
     to_tensor = transforms.ToTensor()
     resize = transforms.Resize((resize_size, resize_size), antialias=True)
     normalize = transforms.Normalize(
@@ -213,6 +212,12 @@ def make_transform(resize_size: int = 224):
     )
     return transforms.Compose([to_tensor, resize, normalize])
 
+def get_image_processor(resize_size: int = 224):
+    return DINOv3ViTImageProcessorFast(
+        do_resize=True,
+        size={"height": resize_size, "width": resize_size},
+        resample=2,  # BILINEAR
+    )
 
 def set_deterministic(seed=42):
     random.seed(seed)
@@ -230,7 +235,7 @@ def set_deterministic(seed=42):
 
 
 @torch.no_grad()
-def convert_and_test_dinov3_checkpoint(model_name):
+def convert_and_test_dinov3_checkpoint(args):
     expected_outputs = {
         "vits_cls": [
             0.4635618329048157,
@@ -317,6 +322,7 @@ def convert_and_test_dinov3_checkpoint(model_name):
             -0.026546532288193703,
         ],
     }
+    model_name = args.model_name
     config = get_dinov3_config(model_name)
     print(config)
 
@@ -330,35 +336,47 @@ def convert_and_test_dinov3_checkpoint(model_name):
     model.load_state_dict(hf_state_dict, strict=True)
     model = model.eval()
 
-    image_preprocessor = make_transform()
-    # load image
-    images = [image_preprocessor(prepare_img())]
-    image_tensor = torch.stack(images, dim=0)
-    with torch.inference_mode():
-        with torch.autocast("cuda", dtype=torch.float):
-            model_output = model(image_tensor)
+    transform = get_transform()
+    image_processor = get_image_processor()
+    image = prepare_img()
+
+    # check preprocessing
+    original_pixel_values = transform(image).unsqueeze(0)  # add batch dimension
+    inputs = image_processor(image, return_tensors="pt")
+
+    torch.testing.assert_close(original_pixel_values, inputs["pixel_values"], atol=1e-6, rtol=1e-6)
+    print("Preprocessing looks ok!")
+
+    with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float):
+        model_output = model(**inputs)
 
     last_layer_class_token = model_output.pooler_output
-    last_layer_patch_tokens = model_output.last_hidden_state[
-        :, config.num_register_tokens + 1 :
-    ]
+    last_layer_patch_tokens = model_output.last_hidden_state[:, config.num_register_tokens + 1:]
+
     actual_outputs = {}
     actual_outputs[f"{model_name}_cls"] = last_layer_class_token[0, :5].tolist()
     actual_outputs[f"{model_name}_patch"] = last_layer_patch_tokens[0, 0, :5].tolist()
-    print(actual_outputs[f"{model_name}_cls"], expected_outputs[f"{model_name}_cls"])
+
+    print("Actual: ", actual_outputs[f"{model_name}_cls"])
+    print("Expected:", expected_outputs[f"{model_name}_cls"])
+
     torch.testing.assert_close(
         torch.Tensor(actual_outputs[f"{model_name}_cls"]),
         torch.Tensor(expected_outputs[f"{model_name}_cls"]),
-        atol=1e-3,
-        rtol=1e-3,
+        atol=1e-4, rtol=1e-4,
     )
     torch.testing.assert_close(
         torch.Tensor(actual_outputs[f"{model_name}_patch"]),
         torch.Tensor(expected_outputs[f"{model_name}_patch"]),
-        atol=1e-3,
-        rtol=1e-3,
+        atol=1e-4, rtol=1e-4,
    )
-    print("Looks ok!")
+    print("Forward pass looks ok!")
+
+    save_dir = os.path.join(args.save_dir, model_name)
+    os.makedirs(save_dir, exist_ok=True)
+    model.save_pretrained(save_dir)
+    image_processor.save_pretrained(save_dir)
+    print(f"Model saved to {save_dir}")
 
 
 if __name__ == "__main__":
@@ -371,5 +389,11 @@ def convert_and_test_dinov3_checkpoint(model_name):
         choices=["vits", "vitsplus", "vitb", "vitl", "vithplus", "vit7b"],
         help="Name of the model you'd like to convert.",
     )
+    parser.add_argument(
+        "--save-dir",
+        default="converted_models",
+        type=str,
+        help="Directory to save the converted model.",
+    )
     args = parser.parse_args()
-    convert_and_test_dinov3_checkpoint(args.model_name)
+    convert_and_test_dinov3_checkpoint(args)
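A note on the get_image_processor helper added above: resample=2 is PIL's integer code for bilinear interpolation, which matches the default interpolation of torchvision's transforms.Resize used in get_transform, so the two preprocessing paths can agree to the 1e-6 tolerance the script asserts. An equivalent, more self-documenting construction (a sketch using the PILImageResampling enum that the new processor module imports):

from transformers import DINOv3ViTImageProcessorFast
from transformers.image_utils import PILImageResampling

# PILImageResampling.BILINEAR has integer value 2, i.e. the same as resample=2.
processor = DINOv3ViTImageProcessorFast(
    do_resize=True,
    size={"height": 224, "width": 224},
    resample=PILImageResampling.BILINEAR,
)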
src/transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py (new file, 102 additions, 0 deletions)

@@ -0,0 +1,102 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Image processor class for DINOv3."""
+
+from typing import Optional, Union
+
+from transformers.image_processing_base import BatchFeature
+from transformers.image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images
+from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling, SizeDict
+from transformers.utils import (
+    TensorType,
+    auto_docstring,
+    is_torch_available,
+    is_torchvision_available,
+    is_torchvision_v2_available,
+    logging,
+)
+from transformers.utils.import_utils import requires
+
+logger = logging.get_logger(__name__)
+
+
+if is_torch_available():
+    import torch
+
+if is_torchvision_v2_available():
+    from torchvision.transforms.v2 import functional as F
+elif is_torchvision_available():
+    from torchvision.transforms import functional as F
+
+
+@auto_docstring
+@requires(backends=("torchvision", "torch"))
+class DINOv3ViTImageProcessorFast(BaseImageProcessorFast):
+    resample = PILImageResampling.BILINEAR
+    image_mean = IMAGENET_DEFAULT_MEAN
+    image_std = IMAGENET_DEFAULT_STD
+    size = {"height": 224, "width": 224}
+    do_resize = True
+    do_rescale = True
+    do_normalize = True
+
+    # Overridden for DINOv3 to preserve the order of transforms:
+    # rescale -> resize -> normalize
+    def _preprocess(
+        self,
+        images: list["torch.Tensor"],
+        do_resize: bool,
+        size: SizeDict,
+        interpolation: Optional["F.InterpolationMode"],
+        do_center_crop: bool,
+        crop_size: SizeDict,
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        disable_grouping: Optional[bool],
+        return_tensors: Optional[Union[str, TensorType]],
+    ) -> BatchFeature:
+
+        # Group images by size for batched resizing
+        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
+        resized_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_rescale:
+                stacked_images = self.rescale(stacked_images, rescale_factor)
+            if do_resize:
+                stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation, antialias=True)
+            resized_images_grouped[shape] = stacked_images
+        resized_images = reorder_images(resized_images_grouped, grouped_images_index)
+
+        # Group images by size for further processing
+        # Needed in case do_resize is False, or resize returns images with different sizes
+        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
+        processed_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_center_crop:
+                stacked_images = self.center_crop(stacked_images, crop_size)
+            if do_normalize:
+                stacked_images = self.normalize(stacked_images, image_mean, image_std)
+            processed_images_grouped[shape] = stacked_images
+
+        processed_images = reorder_images(processed_images_grouped, grouped_images_index)
+        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
+
+        return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
+
+
+__all__ = ["DINOv3ViTImageProcessorFast"]
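For reference, a minimal end-to-end use of the new processor (a sketch; the solid-gray image stands in for real input, and torch plus torchvision must be installed):

from PIL import Image
from transformers import DINOv3ViTImageProcessorFast

# Defaults: rescale to [0, 1], bilinear resize to 224x224, ImageNet mean/std normalize.
processor = DINOv3ViTImageProcessorFast()
image = Image.new("RGB", (640, 480), color="gray")  # stand-in for a real photo
inputs = processor(image, return_tensors="pt")
print(inputs["pixel_values"].shape)  # torch.Size([1, 3, 224, 224])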
