From 221f587f6c5aff30d5fb9bbf60a433b820eeccc7 Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Mon, 6 Oct 2025 15:59:34 +0000 Subject: [PATCH 1/4] set common_kwargs defaults before updating with kwargs --- src/transformers/processing_utils.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index e7786d1ba61d..6b8ff98e0e3c 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -1305,6 +1305,13 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg # pass defaults to output dictionary output_kwargs.update(default_kwargs) + # For `common_kwargs` just update all modality-specific kwargs with same key/values + common_kwargs = kwargs.get("common_kwargs", {}) + common_kwargs.update(ModelProcessorKwargs._defaults.get("common_kwargs", {})) + if common_kwargs: + for kwarg in output_kwargs.values(): + kwarg.update(common_kwargs) + # update modality kwargs with passed kwargs non_modality_kwargs = set(kwargs) - set(output_kwargs) for modality, output_kwarg in output_kwargs.items(): @@ -1354,13 +1361,6 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg f"Keyword argument `{key}` is not a valid argument for this processor and will be ignored." ) - # For `common_kwargs` just update all modality-specific kwargs with same key/values - common_kwargs = kwargs.get("common_kwargs", {}) - common_kwargs.update(ModelProcessorKwargs._defaults.get("common_kwargs", {})) - if common_kwargs: - for kwarg in output_kwargs.values(): - kwarg.update(common_kwargs) - return output_kwargs @classmethod From e76ddae4dbbc68037db752b82a2bdb478691f11b Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Mon, 6 Oct 2025 19:06:50 +0000 Subject: [PATCH 2/4] modify auto image procesor logic and make sure all processors use AutoImageProcessor --- src/transformers/image_processing_utils.py | 6 ++++++ src/transformers/models/align/processing_align.py | 4 ++-- .../models/auto/image_processing_auto.py | 14 ++------------ .../models/bridgetower/processing_bridgetower.py | 8 ++++---- .../models/chameleon/processing_chameleon.py | 6 +++--- src/transformers/models/emu3/processing_emu3.py | 6 +++--- src/transformers/models/flava/processing_flava.py | 6 +++--- src/transformers/models/fuyu/processing_fuyu.py | 6 +++--- .../grounding_dino/processing_grounding_dino.py | 10 +++++----- .../models/idefics/processing_idefics.py | 6 +++--- .../models/idefics2/processing_idefics2.py | 6 +++--- .../models/idefics3/processing_idefics3.py | 6 +++--- src/transformers/models/janus/processing_janus.py | 6 +++--- .../models/layoutlmv2/processing_layoutlmv2.py | 14 +++++++------- .../models/layoutlmv3/processing_layoutlmv3.py | 12 ++++++------ .../models/layoutxlm/processing_layoutxlm.py | 14 +++++++------- .../models/lfm2_vl/processing_lfm2_vl.py | 6 +++--- .../models/mllama/processing_mllama.py | 6 +++--- .../models/oneformer/processing_oneformer.py | 14 +++++++------- .../models/owlvit/processing_owlvit.py | 6 +++--- .../phi4_multimodal/processing_phi4_multimodal.py | 6 +++--- .../models/pix2struct/processing_pix2struct.py | 6 +++--- src/transformers/models/sam/processing_sam.py | 12 ++++++------ src/transformers/models/sam2/processing_sam2.py | 12 ++++++------ .../models/sam2_video/modular_sam2_video.py | 10 +++++----- .../models/sam2_video/processing_sam2_video.py | 8 ++++---- src/transformers/models/sam_hq/processing_samhq.py | 12 ++++++------ 
.../models/siglip2/processing_siglip2.py | 4 ++-- .../models/smolvlm/processing_smolvlm.py | 4 ++-- src/transformers/models/tvp/processing_tvp.py | 4 ++-- src/transformers/models/udop/processing_udop.py | 8 ++++---- .../models/video_llava/processing_video_llava.py | 6 +++--- src/transformers/models/vilt/processing_vilt.py | 8 ++++---- .../models/x_clip/processing_x_clip.py | 8 ++++---- 34 files changed, 133 insertions(+), 137 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 3227b08cf031..f4d8c1606602 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -41,6 +41,12 @@ class BaseImageProcessor(ImageProcessingMixin): def __init__(self, **kwargs): super().__init__(**kwargs) + if not self.is_fast: + logger.warning_once( + f"Using a slow image processor (`{self.__class__.__name__}`). " + "As we are transitioning to fast (PyTorch-native) processors, consider using `AutoImageProcessor` or the model-specific fast image processor class " + "to instantiate a fast image processor." + ) @property def is_fast(self) -> bool: diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index fbca27b2ff39..41ec7ac6a9a4 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -52,7 +52,7 @@ class AlignProcessor(ProcessorMixin): ``` Args: - image_processor ([`EfficientNetImageProcessor`]): + image_processor ([`AutoImageProcessor`]): The image processor is a required input. tokenizer ([`BertTokenizer`, `BertTokenizerFast`]): The tokenizer is a required input. @@ -60,7 +60,7 @@ class AlignProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "EfficientNetImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = ("BertTokenizer", "BertTokenizerFast") valid_processor_kwargs = AlignProcessorKwargs diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index eeea333aa2e8..3684dc17bac9 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -49,9 +49,6 @@ logger = logging.get_logger(__name__) -FORCE_FAST_IMAGE_PROCESSOR = ["Qwen2VLImageProcessor"] - - if TYPE_CHECKING: # This significantly improves completion suggestion performance when # the transformers package is used with Microsoft's Pylance language server. @@ -519,19 +516,12 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): # if use_fast is not set and the processor was saved with a fast processor, we use it, otherwise we use the slow processor. if use_fast is None: use_fast = image_processor_type.endswith("Fast") - if not use_fast and image_processor_type in FORCE_FAST_IMAGE_PROCESSOR and is_torchvision_available(): - use_fast = True + if not use_fast and is_torchvision_available(): logger.warning_once( f"The image processor of type `{image_processor_type}` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. " "This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. " - "Note that this behavior will be extended to all models in a future release." 
- ) - if not use_fast: - logger.warning_once( - "Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. " - "`use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. " - "This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`." ) + use_fast = True if use_fast and not image_processor_type.endswith("Fast"): image_processor_type += "Fast" if use_fast and not is_torchvision_available(): diff --git a/src/transformers/models/bridgetower/processing_bridgetower.py b/src/transformers/models/bridgetower/processing_bridgetower.py index 030c578c49cd..95aedf6fe180 100644 --- a/src/transformers/models/bridgetower/processing_bridgetower.py +++ b/src/transformers/models/bridgetower/processing_bridgetower.py @@ -43,19 +43,19 @@ class BridgeTowerProcessor(ProcessorMixin): Constructs a BridgeTower processor which wraps a Roberta tokenizer and BridgeTower image processor into a single processor. - [`BridgeTowerProcessor`] offers all the functionalities of [`BridgeTowerImageProcessor`] and + [`BridgeTowerProcessor`] offers all the functionalities of [`AutoImageProcessor`] and [`RobertaTokenizerFast`]. See the docstring of [`~BridgeTowerProcessor.__call__`] and [`~BridgeTowerProcessor.decode`] for more information. Args: - image_processor (`BridgeTowerImageProcessor`): - An instance of [`BridgeTowerImageProcessor`]. The image processor is a required input. + image_processor (`AutoImageProcessor`): + An instance of [`AutoImageProcessor`]. The image processor is a required input. tokenizer (`RobertaTokenizerFast`): An instance of ['RobertaTokenizerFast`]. The tokenizer is a required input. """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "BridgeTowerImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = ("RobertaTokenizer", "RobertaTokenizerFast") valid_processor_kwargs = BridgeTowerProcessorKwargs diff --git a/src/transformers/models/chameleon/processing_chameleon.py b/src/transformers/models/chameleon/processing_chameleon.py index 247f72322a2d..53dafc0d504e 100644 --- a/src/transformers/models/chameleon/processing_chameleon.py +++ b/src/transformers/models/chameleon/processing_chameleon.py @@ -55,11 +55,11 @@ class ChameleonProcessor(ProcessorMixin): Constructs a Chameleon processor which wraps a Chameleon image processor and a Chameleon tokenizer into a single processor. - [`ChameleonProcessor`] offers all the functionalities of [`ChameleonImageProcessor`] and [`LlamaTokenizerFast`]. + [`ChameleonProcessor`] offers all the functionalities of [`AutoImageProcessor`] and [`LlamaTokenizerFast`]. See the [`~ChameleonProcessor.__call__`] and [`~ChameleonProcessor.decode`] for more information. Args: - image_processor ([`ChameleonImageProcessor`]): + image_processor ([`AutoImageProcessor`]): The image processor is a required input. tokenizer ([`LlamaTokenizerFast`]): The tokenizer is a required input. 
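For reference, a minimal sketch of what the revised `use_fast` resolution in `image_processing_auto.py` above means for callers. The checkpoint id is a placeholder, and the fast path assumes torchvision is installed:

```python
from transformers import AutoImageProcessor

# Hypothetical checkpoint saved with a slow image processor (placeholder id).
ckpt = "org/model-saved-with-slow-processor"

# With `use_fast` unset, a fast (torchvision-based) processor is now returned
# whenever one is available, and a one-time warning notes the behavior change.
fast = AutoImageProcessor.from_pretrained(ckpt)
print(fast.is_fast)  # expected: True

# Passing `use_fast=False` keeps the original slow processor and its outputs.
slow = AutoImageProcessor.from_pretrained(ckpt, use_fast=False)
print(slow.is_fast)  # expected: False
```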
@@ -71,7 +71,7 @@ class ChameleonProcessor(ProcessorMixin): attributes = ["image_processor", "tokenizer"] tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast") - image_processor_class = "ChameleonImageProcessor" + image_processor_class = "AutoImageProcessor" def __init__(self, image_processor, tokenizer, image_seq_length: int = 1024, image_token: str = ""): self.image_seq_length = image_seq_length diff --git a/src/transformers/models/emu3/processing_emu3.py b/src/transformers/models/emu3/processing_emu3.py index b7ed8e9074f0..ea3336336d32 100644 --- a/src/transformers/models/emu3/processing_emu3.py +++ b/src/transformers/models/emu3/processing_emu3.py @@ -52,11 +52,11 @@ class Emu3Processor(ProcessorMixin): Constructs a Emu3 processor which wraps a Emu3 image processor and a GPT2 tokenizer into a single processor. - [`Emu3Processor`] offers all the functionalities of [`Emu3ImageProcessor`] and [`GPT2TokenizerFast`]. + [`Emu3Processor`] offers all the functionalities of [`AutoImageProcessor`] and [`GPT2TokenizerFast`]. See the [`~Emu3Processor.__call__`] and [`~Emu3Processor.decode`] for more information. Args: - image_processor ([`Emu3ImageProcessor`]): + image_processor ([`AutoImageProcessor`]): The image processor is a required input. tokenizer ([`Emu3TokenizerFast`]): The tokenizer is a required input. @@ -66,7 +66,7 @@ class Emu3Processor(ProcessorMixin): attributes = ["image_processor", "tokenizer"] tokenizer_class = ("GPT2Tokenizer", "GPT2TokenizerFast") - image_processor_class = "Emu3ImageProcessor" + image_processor_class = "AutoImageProcessor" def __init__( self, diff --git a/src/transformers/models/flava/processing_flava.py b/src/transformers/models/flava/processing_flava.py index 8e8a806e8615..76bba6591175 100644 --- a/src/transformers/models/flava/processing_flava.py +++ b/src/transformers/models/flava/processing_flava.py @@ -25,16 +25,16 @@ class FlavaProcessor(ProcessorMixin): r""" Constructs a FLAVA processor which wraps a FLAVA image processor and a FLAVA tokenizer into a single processor. - [`FlavaProcessor`] offers all the functionalities of [`FlavaImageProcessor`] and [`BertTokenizerFast`]. See the + [`FlavaProcessor`] offers all the functionalities of [`AutoImageProcessor`] and [`BertTokenizerFast`]. See the [`~FlavaProcessor.__call__`] and [`~FlavaProcessor.decode`] for more information. Args: - image_processor ([`FlavaImageProcessor`], *optional*): The image processor is a required input. + image_processor ([`AutoImageProcessor`], *optional*): The image processor is a required input. tokenizer ([`BertTokenizerFast`], *optional*): The tokenizer is a required input. """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "FlavaImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = ("BertTokenizer", "BertTokenizerFast") def __init__(self, image_processor=None, tokenizer=None, **kwargs): diff --git a/src/transformers/models/fuyu/processing_fuyu.py b/src/transformers/models/fuyu/processing_fuyu.py index 75b2bbad926e..f9f64957730e 100644 --- a/src/transformers/models/fuyu/processing_fuyu.py +++ b/src/transformers/models/fuyu/processing_fuyu.py @@ -337,18 +337,18 @@ class FuyuProcessor(ProcessorMixin): r""" Constructs a Fuyu processor which wraps a Fuyu image processor and a Llama tokenizer into a single processor. - [`FuyuProcessor`] offers all the functionalities of [`FuyuImageProcessor`] and [`LlamaTokenizerFast`]. 
See the + [`FuyuProcessor`] offers all the functionalities of [`AutoImageProcessor`] and [`LlamaTokenizerFast`]. See the [`~FuyuProcessor.__call__`] and [`~FuyuProcessor.decode`] for more information. Args: - image_processor ([`FuyuImageProcessor`]): + image_processor ([`AutoImageProcessor`]): The image processor is a required input. tokenizer ([`LlamaTokenizerFast`]): The tokenizer is a required input. """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "FuyuImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = "AutoTokenizer" def __init__(self, image_processor, tokenizer, **kwargs): diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 5f2f900451b2..4e090329869a 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -119,19 +119,19 @@ class GroundingDinoProcessor(ProcessorMixin): Constructs a Grounding DINO processor which wraps a Deformable DETR image processor and a BERT tokenizer into a single processor. - [`GroundingDinoProcessor`] offers all the functionalities of [`GroundingDinoImageProcessor`] and + [`GroundingDinoProcessor`] offers all the functionalities of [`AutoImageProcessor`] and [`AutoTokenizer`]. See the docstring of [`~GroundingDinoProcessor.__call__`] and [`~GroundingDinoProcessor.decode`] for more information. Args: - image_processor (`GroundingDinoImageProcessor`): - An instance of [`GroundingDinoImageProcessor`]. The image processor is a required input. + image_processor (`AutoImageProcessor`): + An instance of [`AutoImageProcessor`]. The image processor is a required input. tokenizer (`AutoTokenizer`): An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input. """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "GroundingDinoImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = "AutoTokenizer" valid_processor_kwargs = GroundingDinoProcessorKwargs @@ -145,7 +145,7 @@ def __call__( **kwargs: Unpack[GroundingDinoProcessorKwargs], ) -> BatchEncoding: """ - This method uses [`GroundingDinoImageProcessor.__call__`] method to prepare image(s) for the model, and + This method uses [`AutoImageProcessor.__call__`] method to prepare image(s) for the model, and [`BertTokenizerFast.__call__`] to prepare text for the model. Args: diff --git a/src/transformers/models/idefics/processing_idefics.py b/src/transformers/models/idefics/processing_idefics.py index 4b5ccaffe5c8..00344fd9bfd6 100644 --- a/src/transformers/models/idefics/processing_idefics.py +++ b/src/transformers/models/idefics/processing_idefics.py @@ -137,13 +137,13 @@ def is_url(string): class IdeficsProcessor(ProcessorMixin): r""" - Constructs a IDEFICS processor which wraps a LLama tokenizer and IDEFICS image processor into a single processor. + Constructs a IDEFICS processor which wraps a LLama tokenizer and AutoImageProcessor into a single processor. [`IdeficsProcessor`] offers all the functionalities of [`IdeficsImageProcessor`] and [`LlamaTokenizerFast`]. See the docstring of [`~IdeficsProcessor.__call__`] and [`~IdeficsProcessor.decode`] for more information. Args: - image_processor (`IdeficsImageProcessor`): + image_processor (`AutoImageProcessor`): An instance of [`IdeficsImageProcessor`]. The image processor is a required input. 
tokenizer (`LlamaTokenizerFast`): An instance of [`LlamaTokenizerFast`]. The tokenizer is a required input. @@ -154,7 +154,7 @@ class IdeficsProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "IdeficsImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = "LlamaTokenizerFast" def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_utterance_token=None, **kwargs): diff --git a/src/transformers/models/idefics2/processing_idefics2.py b/src/transformers/models/idefics2/processing_idefics2.py index c419a3641254..15954ce8d5d1 100644 --- a/src/transformers/models/idefics2/processing_idefics2.py +++ b/src/transformers/models/idefics2/processing_idefics2.py @@ -57,13 +57,13 @@ class Idefics2ProcessorKwargs(ProcessingKwargs, total=False): class Idefics2Processor(ProcessorMixin): r""" - Constructs a IDEFICS2 processor which wraps a LLama tokenizer and IDEFICS2 image processor into a single processor. + Constructs a IDEFICS2 processor which wraps a LLama tokenizer and AutoImageProcessor into a single processor. [`IdeficsProcessor`] offers all the functionalities of [`Idefics2ImageProcessor`] and [`LlamaTokenizerFast`]. See the docstring of [`~IdeficsProcessor.__call__`] and [`~IdeficsProcessor.decode`] for more information. Args: - image_processor (`Idefics2ImageProcessor`): + image_processor (`AutoImageProcessor`): An instance of [`Idefics2ImageProcessor`]. The image processor is a required input. tokenizer (`PreTrainedTokenizerBase`, *optional*): An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input. @@ -76,7 +76,7 @@ class Idefics2Processor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "Idefics2ImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = "AutoTokenizer" def __init__( diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py index 451af1d8a38f..09acdedb5b76 100644 --- a/src/transformers/models/idefics3/processing_idefics3.py +++ b/src/transformers/models/idefics3/processing_idefics3.py @@ -103,13 +103,13 @@ class Idefics3ProcessorKwargs(ProcessingKwargs, total=False): class Idefics3Processor(ProcessorMixin): r""" - Constructs a Idefics3 processor which wraps a LLama tokenizer and Idefics3 image processor into a single processor. + Constructs a Idefics3 processor which wraps a LLama tokenizer and AutoImageProcessor into a single processor. [`Idefics3Processor`] offers all the functionalities of [`Idefics3ImageProcessor`] and [`Idefics3TokenizerFast`]. See the docstring of [`~IdeficsProcessor.__call__`] and [`~IdeficsProcessor.decode`] for more information. Args: - image_processor (`Idefics3ImageProcessor`): + image_processor (`AutoImageProcessor`): An instance of [`Idefics3ImageProcessor`]. The image processor is a required input. tokenizer (`PreTrainedTokenizerBase`, *optional*): An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input. 
@@ -122,7 +122,7 @@ class Idefics3Processor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "Idefics3ImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = "AutoTokenizer" def __init__( diff --git a/src/transformers/models/janus/processing_janus.py b/src/transformers/models/janus/processing_janus.py index 15c237c4ced4..b11e61db8e73 100644 --- a/src/transformers/models/janus/processing_janus.py +++ b/src/transformers/models/janus/processing_janus.py @@ -50,11 +50,11 @@ class JanusProcessor(ProcessorMixin): r""" Constructs a Janus processor which wraps a Janus Image Processor and a Llama tokenizer into a single processor. - [`JanusProcessor`] offers all the functionalities of [`JanusImageProcessor`] and [`LlamaTokenizerFast`]. See the + [`JanusProcessor`] offers all the functionalities of [`AutoImageProcessor`] and [`LlamaTokenizerFast`]. See the [`~JanusProcessor.__call__`] and [`~JanusProcessor.decode`] for more information. Args: - image_processor ([`JanusImageProcessor`]): + image_processor ([`AutoImageProcessor`]): The image processor is a required input. tokenizer ([`LlamaTokenizerFast`]): The tokenizer is a required input. @@ -65,7 +65,7 @@ class JanusProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "JanusImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = "LlamaTokenizerFast" def __init__(self, image_processor, tokenizer, chat_template=None, use_default_system_prompt=False, **kwargs): diff --git a/src/transformers/models/layoutlmv2/processing_layoutlmv2.py b/src/transformers/models/layoutlmv2/processing_layoutlmv2.py index 603cdf4df4e9..5e95461b6265 100644 --- a/src/transformers/models/layoutlmv2/processing_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/processing_layoutlmv2.py @@ -31,21 +31,21 @@ class LayoutLMv2Processor(ProcessorMixin): [`LayoutLMv2Processor`] offers all the functionalities you need to prepare data for the model. - It first uses [`LayoutLMv2ImageProcessor`] to resize document images to a fixed size, and optionally applies OCR to + It first uses [`AutoImageProcessor`] to resize document images to a fixed size, and optionally applies OCR to get words and normalized bounding boxes. These are then provided to [`LayoutLMv2Tokenizer`] or [`LayoutLMv2TokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`, `attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned into token-level `labels` for token classification tasks (such as FUNSD, CORD). Args: - image_processor (`LayoutLMv2ImageProcessor`, *optional*): - An instance of [`LayoutLMv2ImageProcessor`]. The image processor is a required input. + image_processor (`AutoImageProcessor`, *optional*): + An instance of [`AutoImageProcessor`]. The image processor is a required input. tokenizer (`LayoutLMv2Tokenizer` or `LayoutLMv2TokenizerFast`, *optional*): An instance of [`LayoutLMv2Tokenizer`] or [`LayoutLMv2TokenizerFast`]. The tokenizer is a required input. 
""" attributes = ["image_processor", "tokenizer"] - image_processor_class = "LayoutLMv2ImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast") def __init__(self, image_processor=None, tokenizer=None, **kwargs): @@ -86,10 +86,10 @@ def __call__( **kwargs, ) -> BatchEncoding: """ - This method first forwards the `images` argument to [`~LayoutLMv2ImageProcessor.__call__`]. In case - [`LayoutLMv2ImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and + This method first forwards the `images` argument to [`~AutoImageProcessor.__call__`]. In case + [`AutoImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and bounding boxes along with the additional arguments to [`~LayoutLMv2Tokenizer.__call__`] and returns the output, - together with resized `images`. In case [`LayoutLMv2ImageProcessor`] was initialized with `apply_ocr` set to + together with resized `images`. In case [`AutoImageProcessor`] was initialized with `apply_ocr` set to `False`, it passes the words (`text`/``text_pair`) and `boxes` specified by the user along with the additional arguments to [`~LayoutLMv2Tokenizer.__call__`] and returns the output, together with resized `images``. diff --git a/src/transformers/models/layoutlmv3/processing_layoutlmv3.py b/src/transformers/models/layoutlmv3/processing_layoutlmv3.py index 1f1b6cead607..15ae7a0e3a8f 100644 --- a/src/transformers/models/layoutlmv3/processing_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/processing_layoutlmv3.py @@ -31,21 +31,21 @@ class LayoutLMv3Processor(ProcessorMixin): [`LayoutLMv3Processor`] offers all the functionalities you need to prepare data for the model. - It first uses [`LayoutLMv3ImageProcessor`] to resize and normalize document images, and optionally applies OCR to + It first uses [`AutoImageProcessor`] to resize and normalize document images, and optionally applies OCR to get words and normalized bounding boxes. These are then provided to [`LayoutLMv3Tokenizer`] or [`LayoutLMv3TokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`, `attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned into token-level `labels` for token classification tasks (such as FUNSD, CORD). Args: - image_processor (`LayoutLMv3ImageProcessor`, *optional*): - An instance of [`LayoutLMv3ImageProcessor`]. The image processor is a required input. + image_processor (`AutoImageProcessor`, *optional*): + An instance of [`AutoImageProcessor`]. The image processor is a required input. tokenizer (`LayoutLMv3Tokenizer` or `LayoutLMv3TokenizerFast`, *optional*): An instance of [`LayoutLMv3Tokenizer`] or [`LayoutLMv3TokenizerFast`]. The tokenizer is a required input. """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "LayoutLMv3ImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast") def __init__(self, image_processor=None, tokenizer=None, **kwargs): @@ -86,8 +86,8 @@ def __call__( **kwargs, ) -> BatchEncoding: """ - This method first forwards the `images` argument to [`~LayoutLMv3ImageProcessor.__call__`]. In case - [`LayoutLMv3ImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and + This method first forwards the `images` argument to [`~AutoImageProcessor.__call__`]. 
In case + [`AutoImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and bounding boxes along with the additional arguments to [`~LayoutLMv3Tokenizer.__call__`] and returns the output, together with resized and normalized `pixel_values`. In case [`LayoutLMv3ImageProcessor`] was initialized with `apply_ocr` set to `False`, it passes the words (`text`/``text_pair`) and `boxes` specified by the user along diff --git a/src/transformers/models/layoutxlm/processing_layoutxlm.py b/src/transformers/models/layoutxlm/processing_layoutxlm.py index e3ece89f434b..bb49e5291519 100644 --- a/src/transformers/models/layoutxlm/processing_layoutxlm.py +++ b/src/transformers/models/layoutxlm/processing_layoutxlm.py @@ -31,21 +31,21 @@ class LayoutXLMProcessor(ProcessorMixin): [`LayoutXLMProcessor`] offers all the functionalities you need to prepare data for the model. - It first uses [`LayoutLMv2ImageProcessor`] to resize document images to a fixed size, and optionally applies OCR to + It first uses [`AutoImageProcessor`] to resize document images to a fixed size, and optionally applies OCR to get words and normalized bounding boxes. These are then provided to [`LayoutXLMTokenizer`] or [`LayoutXLMTokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`, `attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned into token-level `labels` for token classification tasks (such as FUNSD, CORD). Args: - image_processor (`LayoutLMv2ImageProcessor`, *optional*): - An instance of [`LayoutLMv2ImageProcessor`]. The image processor is a required input. + image_processor (`AutoImageProcessor`, *optional*): + An instance of [`AutoImageProcessor`]. The image processor is a required input. tokenizer (`LayoutXLMTokenizer` or `LayoutXLMTokenizerFast`, *optional*): An instance of [`LayoutXLMTokenizer`] or [`LayoutXLMTokenizerFast`]. The tokenizer is a required input. """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "LayoutLMv2ImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast") def __init__(self, image_processor=None, tokenizer=None, **kwargs): @@ -85,10 +85,10 @@ def __call__( **kwargs, ) -> BatchEncoding: """ - This method first forwards the `images` argument to [`~LayoutLMv2ImagePrpcessor.__call__`]. In case - [`LayoutLMv2ImagePrpcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and + This method first forwards the `images` argument to [`~AutoImageProcessor.__call__`]. In case + [`AutoImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and bounding boxes along with the additional arguments to [`~LayoutXLMTokenizer.__call__`] and returns the output, - together with resized `images`. In case [`LayoutLMv2ImagePrpcessor`] was initialized with `apply_ocr` set to + together with resized `images`. In case [`AutoImageProcessor`] was initialized with `apply_ocr` set to `False`, it passes the words (`text`/``text_pair`) and `boxes` specified by the user along with the additional arguments to [`~LayoutXLMTokenizer.__call__`] and returns the output, together with resized `images``. 
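As an illustration of what switching `image_processor_class` to `"AutoImageProcessor"` means for these wrapper processors, the concrete image processor is now resolved from the checkpoint's saved `image_processor_type` instead of a class pinned in the processor definition. A rough sketch; the checkpoint id below is a placeholder, not a real repository:

```python
from transformers import AutoProcessor

# Placeholder id for any checkpoint of the processors updated in this commit.
processor = AutoProcessor.from_pretrained("org/some-vision-language-checkpoint")

# The image processor sub-component is loaded through AutoImageProcessor,
# so its concrete class (slow or fast) comes from the checkpoint config.
print(type(processor.image_processor).__name__)
print(processor.image_processor.is_fast)
```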
diff --git a/src/transformers/models/lfm2_vl/processing_lfm2_vl.py b/src/transformers/models/lfm2_vl/processing_lfm2_vl.py index 12f289c266a1..e69f9a395e7c 100755 --- a/src/transformers/models/lfm2_vl/processing_lfm2_vl.py +++ b/src/transformers/models/lfm2_vl/processing_lfm2_vl.py @@ -64,12 +64,12 @@ class Lfm2VlProcessorKwargs(ProcessingKwargs, total=False): class Lfm2VlProcessor(ProcessorMixin): r""" - Constructs a Lfm2Vl processor which wraps a Lfm2Tokenizer tokenizer and Lfm2VlImageProcessor into a single processor. + Constructs a Lfm2Vl processor which wraps a Lfm2Tokenizer tokenizer and AutoImageProcessor into a single processor. [`Lfm2VlProcessor`] offers all the functionalities of [`Lfm2ImageProcessor`] and [`Lfm2Tokenizer`]. Args: - image_processor (`Lfm2VlImageProcessor`): + image_processor (`AutoImageProcessor`): An instance of [`Lfm2VlImageProcessor`]. The image processor is a required input. tokenizer (`PreTrainedTokenizerBase`): An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input. @@ -80,7 +80,7 @@ class Lfm2VlProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "Lfm2VlImageProcessorFast" + image_processor_class = "AutoImageProcessor" tokenizer_class = "AutoTokenizer" def __init__( diff --git a/src/transformers/models/mllama/processing_mllama.py b/src/transformers/models/mllama/processing_mllama.py index 3955006a4f9e..eda8160c2594 100644 --- a/src/transformers/models/mllama/processing_mllama.py +++ b/src/transformers/models/mllama/processing_mllama.py @@ -169,7 +169,7 @@ def build_string_from_input(prompt: str, bos_token: str, image_token: str) -> st class MllamaProcessor(ProcessorMixin): r""" Constructs a Mllama processor which wraps [`MllamaImageProcessor`] and - [`PretrainedTokenizerFast`] into a single processor that inherits both the image processor and + [`AutoTokenizer`] into a single processor that inherits both the image processor and tokenizer functionalities. See the [`~MllamaProcessor.__call__`] and [`~OwlViTProcessor.decode`] for more information. The preferred way of passing kwargs is as a dictionary per modality, see usage example below. @@ -189,7 +189,7 @@ class MllamaProcessor(ProcessorMixin): ``` Args: - image_processor ([`MllamaImageProcessor`]): + image_processor ([`AutoImageProcessor`]): The image processor is a required input. tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`]): The tokenizer is a required input. @@ -199,7 +199,7 @@ class MllamaProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "MllamaImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = "PreTrainedTokenizerFast" def __init__(self, image_processor, tokenizer, chat_template=None): diff --git a/src/transformers/models/oneformer/processing_oneformer.py b/src/transformers/models/oneformer/processing_oneformer.py index de5a0474e26a..ae269ec0812b 100644 --- a/src/transformers/models/oneformer/processing_oneformer.py +++ b/src/transformers/models/oneformer/processing_oneformer.py @@ -31,7 +31,7 @@ class OneFormerProcessor(ProcessorMixin): tokenizer functionalities. Args: - image_processor ([`OneFormerImageProcessor`]): + image_processor ([`AutoImageProcessor`]): The image processor is a required input. tokenizer ([`CLIPTokenizer`, `CLIPTokenizerFast`]): The tokenizer is a required input. 
@@ -42,7 +42,7 @@ class OneFormerProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "OneFormerImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast") def __init__( @@ -74,7 +74,7 @@ def __call__(self, images=None, task_inputs=None, segmentation_maps=None, **kwar Main method to prepare for the model one or several task input(s) and image(s). This method forwards the `task_inputs` and `kwargs` arguments to CLIPTokenizer's [`~CLIPTokenizer.__call__`] if `task_inputs` is not `None` to encode. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to - OneFormerImageProcessor's [`~OneFormerImageProcessor.__call__`] if `images` is not `None`. Please refer to the + AutoImageProcessor's [`~AutoImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring of the above two methods for more information. Args: @@ -137,7 +137,7 @@ def __call__(self, images=None, task_inputs=None, segmentation_maps=None, **kwar def encode_inputs(self, images=None, task_inputs=None, segmentation_maps=None, **kwargs): """ - This method forwards all its arguments to [`OneFormerImageProcessor.encode_inputs`] and then tokenizes the + This method forwards all its arguments to [`AutoImageProcessor.encode_inputs`] and then tokenizes the task_inputs. Please refer to the docstring of this method for more information. """ @@ -177,21 +177,21 @@ def encode_inputs(self, images=None, task_inputs=None, segmentation_maps=None, * def post_process_semantic_segmentation(self, *args, **kwargs): """ - This method forwards all its arguments to [`OneFormerImageProcessor.post_process_semantic_segmentation`]. + This method forwards all its arguments to [`AutoImageProcessor.post_process_semantic_segmentation`]. Please refer to the docstring of this method for more information. """ return self.image_processor.post_process_semantic_segmentation(*args, **kwargs) def post_process_instance_segmentation(self, *args, **kwargs): """ - This method forwards all its arguments to [`OneFormerImageProcessor.post_process_instance_segmentation`]. + This method forwards all its arguments to [`AutoImageProcessor.post_process_instance_segmentation`]. Please refer to the docstring of this method for more information. """ return self.image_processor.post_process_instance_segmentation(*args, **kwargs) def post_process_panoptic_segmentation(self, *args, **kwargs): """ - This method forwards all its arguments to [`OneFormerImageProcessor.post_process_panoptic_segmentation`]. + This method forwards all its arguments to [`AutoImageProcessor.post_process_panoptic_segmentation`]. Please refer to the docstring of this method for more information. """ return self.image_processor.post_process_panoptic_segmentation(*args, **kwargs) diff --git a/src/transformers/models/owlvit/processing_owlvit.py b/src/transformers/models/owlvit/processing_owlvit.py index e7fb401d9a76..7569de9be876 100644 --- a/src/transformers/models/owlvit/processing_owlvit.py +++ b/src/transformers/models/owlvit/processing_owlvit.py @@ -60,14 +60,14 @@ class OwlViTProcessor(ProcessorMixin): [`~OwlViTProcessor.__call__`] and [`~OwlViTProcessor.decode`] for more information. Args: - image_processor ([`OwlViTImageProcessor`], *optional*): + image_processor ([`AutoImageProcessor`], *optional*): The image processor is a required input. tokenizer ([`CLIPTokenizer`, `CLIPTokenizerFast`], *optional*): The tokenizer is a required input. 
""" attributes = ["image_processor", "tokenizer"] - image_processor_class = "OwlViTImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast") def __init__(self, image_processor=None, tokenizer=None, **kwargs): @@ -94,7 +94,7 @@ def __call__( Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode: the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to - CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring + AutoImageProcessor's [`~AutoImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring of the above two methods for more information. Args: diff --git a/src/transformers/models/phi4_multimodal/processing_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/processing_phi4_multimodal.py index 4a1af1d6bb78..ca9e5f4a168c 100644 --- a/src/transformers/models/phi4_multimodal/processing_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/processing_phi4_multimodal.py @@ -46,7 +46,7 @@ class Phi4MultimodalProcessor(ProcessorMixin): [`~Phi4MultimodalProcessor.__call__`] and [`~Phi4MultimodalProcessor.decode`] for more information. Args: - image_processor (`Phi4MultimodalImageProcessorFast`): + image_processor (`AutoImageProcessor`): The image processor to use for images. audio_processor (`Phi4MultimodalFeatureExtractor`): The audio processor to use for audio inputs. @@ -60,7 +60,7 @@ class Phi4MultimodalProcessor(ProcessorMixin): attributes = ["image_processor", "audio_processor", "tokenizer"] tokenizer_class = "GPT2TokenizerFast" - image_processor_class = "Phi4MultimodalImageProcessorFast" + image_processor_class = "AutoImageProcessor" audio_processor_class = "Phi4MultimodalFeatureExtractor" def __init__( @@ -87,7 +87,7 @@ def __call__( Main method to prepare for the model one or several sequences(s) and image(s). This method forards the `text` and `kwargs` arguments to GPT2Tokenizer's [`~GPT2Tokenizer.__call__`] if `text` is not `None` to encode the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to - Phi4MultimodalImageProcessorFast's [`~Phi4MultimodalImageProcessorFast.__call__`] if `images` is not `None`. Please refer to the doctsring + AutoImageProcessor's [`~AutoImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring of the above two methods for more information. Args: diff --git a/src/transformers/models/pix2struct/processing_pix2struct.py b/src/transformers/models/pix2struct/processing_pix2struct.py index fba2fe93ef19..c4c8f3f18d3d 100644 --- a/src/transformers/models/pix2struct/processing_pix2struct.py +++ b/src/transformers/models/pix2struct/processing_pix2struct.py @@ -55,14 +55,14 @@ class Pix2StructProcessor(ProcessorMixin): the docstring of [`~Pix2StructProcessor.__call__`] and [`~Pix2StructProcessor.decode`] for more information. Args: - image_processor (`Pix2StructImageProcessor`): + image_processor (`AutoImageProcessor`): An instance of [`Pix2StructImageProcessor`]. The image processor is a required input. tokenizer (Union[`T5TokenizerFast`, `T5Tokenizer`]): An instance of ['T5TokenizerFast`] or ['T5Tokenizer`]. The tokenizer is a required input. 
""" attributes = ["image_processor", "tokenizer"] - image_processor_class = "Pix2StructImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = ("T5Tokenizer", "T5TokenizerFast") def __init__(self, image_processor, tokenizer): @@ -76,7 +76,7 @@ def __call__( **kwargs: Unpack[Pix2StructProcessorKwargs], ) -> Union[BatchEncoding, BatchFeature]: """ - This method uses [`Pix2StructImageProcessor.preprocess`] method to prepare image(s) for the model, and + This method uses [`AutoImageProcessor.preprocess`] method to prepare image(s) for the model, and [`T5TokenizerFast.__call__`] to prepare text for the model. Please refer to the docstring of the above two methods for more information. diff --git a/src/transformers/models/sam/processing_sam.py b/src/transformers/models/sam/processing_sam.py index bc82daf2034d..0add0c9735ec 100644 --- a/src/transformers/models/sam/processing_sam.py +++ b/src/transformers/models/sam/processing_sam.py @@ -55,16 +55,16 @@ class SamProcessor(ProcessorMixin): Constructs a SAM processor which wraps a SAM image processor and an 2D points & Bounding boxes processor into a single processor. - [`SamProcessor`] offers all the functionalities of [`SamImageProcessor`]. See the docstring of - [`~SamImageProcessor.__call__`] for more information. + [`SamProcessor`] offers all the functionalities of [`AutoImageProcessor`]. See the docstring of + [`~AutoImageProcessor.__call__`] for more information. Args: - image_processor (`SamImageProcessor`): - An instance of [`SamImageProcessor`]. The image processor is a required input. + image_processor (`AutoImageProcessor`): + An instance of [`AutoImageProcessor`]. The image processor is a required input. """ attributes = ["image_processor"] - image_processor_class = "SamImageProcessor" + image_processor_class = "AutoImageProcessor" def __init__(self, image_processor): super().__init__(image_processor) @@ -77,7 +77,7 @@ def __call__( **kwargs, ) -> BatchEncoding: """ - This method uses [`SamImageProcessor.__call__`] method to prepare image(s) for the model. It also prepares 2D + This method uses [`AutoImageProcessor.__call__`] method to prepare image(s) for the model. It also prepares 2D points and bounding boxes for the model if they are provided. """ output_kwargs = self._merge_kwargs( diff --git a/src/transformers/models/sam2/processing_sam2.py b/src/transformers/models/sam2/processing_sam2.py index 5f147aab8dfa..45ebd1b64eed 100644 --- a/src/transformers/models/sam2/processing_sam2.py +++ b/src/transformers/models/sam2/processing_sam2.py @@ -40,12 +40,12 @@ class Sam2Processor(ProcessorMixin): Constructs a SAM2 processor which wraps a SAM2 image processor and an 2D points & Bounding boxes processor into a single processor. - [`Sam2Processor`] offers all the functionalities of [`Sam2ImageProcessorFast`] and [`Sam2VideoProcessor`]. See the docstring of - [`~Sam2ImageProcessorFast.__call__`] and [`~Sam2VideoProcessor.__call__`] for more information. + [`Sam2Processor`] offers all the functionalities of [`AutoImageProcessor`] and [`Sam2VideoProcessor`]. See the docstring of + [`~AutoImageProcessor.__call__`] and [`~Sam2VideoProcessor.__call__`] for more information. Args: - image_processor (`Sam2ImageProcessorFast`): - An instance of [`Sam2ImageProcessorFast`]. + image_processor (`AutoImageProcessor`): + An instance of [`AutoImageProcessor`]. target_size (`int`, *optional*): The target size (target_size, target_size) to which the image will be resized. 
point_pad_value (`int`, *optional*, defaults to -10): @@ -53,7 +53,7 @@ class Sam2Processor(ProcessorMixin): """ attributes = ["image_processor"] - image_processor_class = "Sam2ImageProcessorFast" + image_processor_class = "AutoImageProcessor" def __init__(self, image_processor, target_size: Optional[int] = None, point_pad_value: int = -10, **kwargs): super().__init__(image_processor, **kwargs) @@ -72,7 +72,7 @@ def __call__( **kwargs, ) -> BatchEncoding: r""" - This method uses [`Sam2ImageProcessorFast.__call__`] method to prepare image(s) for the model. It also prepares 2D + This method uses [`AutoImageProcessor.__call__`] method to prepare image(s) for the model. It also prepares 2D points and bounding boxes for the model if they are provided. Args: diff --git a/src/transformers/models/sam2_video/modular_sam2_video.py b/src/transformers/models/sam2_video/modular_sam2_video.py index 091844f0aa1c..4dcdf914c89b 100644 --- a/src/transformers/models/sam2_video/modular_sam2_video.py +++ b/src/transformers/models/sam2_video/modular_sam2_video.py @@ -606,12 +606,12 @@ class Sam2VideoProcessor(Sam2Processor): Constructs a SAM2 processor which wraps a SAM2 image processor and an 2D points & Bounding boxes processor into a single processor. - [`Sam2VideoProcessor`] offers all the functionalities of [`Sam2ImageProcessorFast`] and [`Sam2VideoProcessor`]. See the docstring of - [`~Sam2ImageProcessorFast.__call__`] and [`~Sam2VideoProcessor.__call__`] for more information. + [`Sam2VideoProcessor`] offers all the functionalities of [`AutoImageProcessor`] and [`Sam2VideoProcessor`]. See the docstring of + [`~AutoImageProcessor.__call__`] and [`~Sam2VideoProcessor.__call__`] for more information. Args: - image_processor (`Sam2ImageProcessorFast`): - An instance of [`Sam2ImageProcessorFast`]. + image_processor (`AutoImageProcessor`): + An instance of [`AutoImageProcessor`]. video_processor (`Sam2VideoVideoProcessor`): An instance of [`Sam2VideoVideoProcessor`]. target_size (`int`, *optional*): @@ -621,7 +621,7 @@ class Sam2VideoProcessor(Sam2Processor): """ attributes = ["image_processor", "video_processor"] - image_processor_class = "Sam2ImageProcessorFast" + image_processor_class = "AutoImageProcessor" video_processor_class = "Sam2VideoVideoProcessor" def __init__( diff --git a/src/transformers/models/sam2_video/processing_sam2_video.py b/src/transformers/models/sam2_video/processing_sam2_video.py index d5a3c94d7f87..5fd79a234183 100644 --- a/src/transformers/models/sam2_video/processing_sam2_video.py +++ b/src/transformers/models/sam2_video/processing_sam2_video.py @@ -39,11 +39,11 @@ class Sam2VideoProcessor(ProcessorMixin): Constructs a SAM2 processor which wraps a SAM2 image processor and an 2D points & Bounding boxes processor into a single processor. - [`Sam2VideoProcessor`] offers all the functionalities of [`Sam2ImageProcessorFast`] and [`Sam2VideoProcessor`]. See the docstring of + [`Sam2VideoProcessor`] offers all the functionalities of [`AutoImageProcessor`] and [`Sam2VideoProcessor`]. See the docstring of [`~Sam2ImageProcessorFast.__call__`] and [`~Sam2VideoProcessor.__call__`] for more information. Args: - image_processor (`Sam2ImageProcessorFast`): + image_processor (`AutoImageProcessor`): An instance of [`Sam2ImageProcessorFast`]. video_processor (`Sam2VideoVideoProcessor`): An instance of [`Sam2VideoVideoProcessor`]. 
@@ -54,7 +54,7 @@ class Sam2VideoProcessor(ProcessorMixin): """ attributes = ["image_processor", "video_processor"] - image_processor_class = "Sam2ImageProcessorFast" + image_processor_class = "AutoImageProcessor" video_processor_class = "Sam2VideoVideoProcessor" def __init__( @@ -76,7 +76,7 @@ def __call__( **kwargs, ) -> BatchEncoding: r""" - This method uses [`Sam2VideoImageProcessorFast.__call__`] method to prepare image(s) for the model. It also prepares 2D + This method uses [`AutoImageProcessor.__call__`] method to prepare image(s) for the model. It also prepares 2D points and bounding boxes for the model if they are provided. Args: diff --git a/src/transformers/models/sam_hq/processing_samhq.py b/src/transformers/models/sam_hq/processing_samhq.py index 902e68832836..0281bec63489 100644 --- a/src/transformers/models/sam_hq/processing_samhq.py +++ b/src/transformers/models/sam_hq/processing_samhq.py @@ -55,16 +55,16 @@ class SamHQProcessor(ProcessorMixin): Constructs a SAM HQ processor which wraps a SAM image processor and an 2D points & Bounding boxes processor into a single processor. - [`SamHQProcessor`] offers all the functionalities of [`SamImageProcessor`]. See the docstring of - [`~SamImageProcessor.__call__`] for more information. + [`SamHQProcessor`] offers all the functionalities of [`AutoImageProcessor`]. See the docstring of + [`~AutoImageProcessor.__call__`] for more information. Args: - image_processor (`SamImageProcessor`): - An instance of [`SamImageProcessor`]. The image processor is a required input. + image_processor (`AutoImageProcessor`): + An instance of [`AutoImageProcessor`]. The image processor is a required input. """ attributes = ["image_processor"] - image_processor_class = "SamImageProcessor" + image_processor_class = "AutoImageProcessor" def __init__(self, image_processor): super().__init__(image_processor) @@ -82,7 +82,7 @@ def __call__( **kwargs: Unpack[SamHQProcessorKwargs], ) -> BatchEncoding: """ - This method uses [`SamImageProcessor.__call__`] method to prepare image(s) for the model. It also prepares 2D + This method uses [`AutoImageProcessor.__call__`] method to prepare image(s) for the model. It also prepares 2D points and bounding boxes for the model if they are provided. """ output_kwargs = self._merge_kwargs( diff --git a/src/transformers/models/siglip2/processing_siglip2.py b/src/transformers/models/siglip2/processing_siglip2.py index b16650303da4..4a9dbec65d4e 100644 --- a/src/transformers/models/siglip2/processing_siglip2.py +++ b/src/transformers/models/siglip2/processing_siglip2.py @@ -37,11 +37,11 @@ class Siglip2Processor(ProcessorMixin): r""" Constructs a Siglip2 processor which wraps a Siglip2 image processor and a Gemma tokenizer into a single processor. - [`Siglip2Processor`] offers all the functionalities of [`Siglip2ImageProcessor`] and [`GemmaTokenizerFast`]. See the + [`Siglip2Processor`] offers all the functionalities of [`AutoImageProcessor`] and [`GemmaTokenizerFast`]. See the [`~Siglip2Processor.__call__`] and [`~Siglip2Processor.decode`] for more information. Args: - image_processor ([`Siglip2ImageProcessor`]): + image_processor ([`AutoImageProcessor`]): The image processor is a required input. tokenizer ([`GemmaTokenizerFast`]): The tokenizer is a required input. 
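The `BaseImageProcessor.__init__` warning added earlier in this commit surfaces whenever a slow image processor class is instantiated directly, independently of the `Auto` classes. A rough sketch, using ViLT purely as an illustration of a model that still ships a slow class:

```python
from transformers import ViltImageProcessor

# Instantiating a slow image processor class directly now emits a one-time
# warning pointing to AutoImageProcessor or the model-specific fast class.
image_processor = ViltImageProcessor()
print(image_processor.is_fast)  # expected: False
```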
diff --git a/src/transformers/models/smolvlm/processing_smolvlm.py b/src/transformers/models/smolvlm/processing_smolvlm.py index 86d07e238f1b..bf400dbb0c15 100644 --- a/src/transformers/models/smolvlm/processing_smolvlm.py +++ b/src/transformers/models/smolvlm/processing_smolvlm.py @@ -127,7 +127,7 @@ class SmolVLMProcessor(ProcessorMixin): the docstring of [`~IdeficsProcessor.__call__`] and [`~IdeficsProcessor.decode`] for more information. Args: - image_processor (`SmolVLMImageProcessor`): + image_processor (`AutoImageProcessor`): An instance of [`SmolVLMImageProcessor`]. The image processor is a required input. tokenizer (`PreTrainedTokenizerBase`): An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input. @@ -142,7 +142,7 @@ class SmolVLMProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer", "video_processor"] - image_processor_class = "SmolVLMImageProcessor" + image_processor_class = "AutoImageProcessor" video_processor_class = "SmolVLMVideoProcessor" # NOTE: uses different interpolation than slow processors tokenizer_class = "AutoTokenizer" diff --git a/src/transformers/models/tvp/processing_tvp.py b/src/transformers/models/tvp/processing_tvp.py index 7cec0f14ab76..0bb0a265f5e7 100644 --- a/src/transformers/models/tvp/processing_tvp.py +++ b/src/transformers/models/tvp/processing_tvp.py @@ -38,14 +38,14 @@ class TvpProcessor(ProcessorMixin): [`~TvpProcessor.__call__`] and [`~TvpProcessor.decode`] for more information. Args: - image_processor ([`TvpImageProcessor`], *optional*): + image_processor ([`AutoImageProcessor`], *optional*): The image processor is a required input. tokenizer ([`BertTokenizerFast`], *optional*): The tokenizer is a required input. """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "TvpImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = ("BertTokenizer", "BertTokenizerFast") def __init__(self, image_processor=None, tokenizer=None, **kwargs): diff --git a/src/transformers/models/udop/processing_udop.py b/src/transformers/models/udop/processing_udop.py index 1be71aea63e2..9669f5eb8e2e 100644 --- a/src/transformers/models/udop/processing_udop.py +++ b/src/transformers/models/udop/processing_udop.py @@ -67,14 +67,14 @@ class UdopProcessor(ProcessorMixin): prepare labels for language modeling tasks. Args: - image_processor (`LayoutLMv3ImageProcessor`): + image_processor (`AutoImageProcessor`): An instance of [`LayoutLMv3ImageProcessor`]. The image processor is a required input. tokenizer (`UdopTokenizer` or `UdopTokenizerFast`): An instance of [`UdopTokenizer`] or [`UdopTokenizerFast`]. The tokenizer is a required input. """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "LayoutLMv3ImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = ("UdopTokenizer", "UdopTokenizerFast") def __init__(self, image_processor, tokenizer): @@ -88,9 +88,9 @@ def __call__( ) -> BatchFeature: """ This method first forwards the `images` argument to [`~UdopImageProcessor.__call__`]. In case - [`UdopImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and + [`AutoImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and bounding boxes along with the additional arguments to [`~UdopTokenizer.__call__`] and returns the output, - together with the prepared `pixel_values`. 
In case [`UdopImageProcessor`] was initialized with `apply_ocr` set + together with the prepared `pixel_values`. In case [`AutoImageProcessor`] was initialized with `apply_ocr` set to `False`, it passes the words (`text`/``text_pair`) and `boxes` specified by the user along with the additional arguments to [`~UdopTokenizer.__call__`] and returns the output, together with the prepared `pixel_values`. diff --git a/src/transformers/models/video_llava/processing_video_llava.py b/src/transformers/models/video_llava/processing_video_llava.py index a6f826fa72a3..1506b15d458b 100644 --- a/src/transformers/models/video_llava/processing_video_llava.py +++ b/src/transformers/models/video_llava/processing_video_llava.py @@ -32,13 +32,13 @@ class VideoLlavaProcessor(ProcessorMixin): r""" - Constructs a VideoLlava processor which wraps a VideoLlava image processor and a Llava tokenizer into a single processor. + Constructs a VideoLlava processor which wraps a AutoImageProcessor and a Llava tokenizer into a single processor. [`VideoLlavaProcessor`] offers all the functionalities of [`VideoLlavaImageProcessor`] and [`LlamaTokenizerFast`]. See the [`~VideoLlavaProcessor.__call__`] and [`~VideoLlavaProcessor.decode`] for more information. Args: - image_processor ([`VideoLlavaImageProcessor`], *optional*): + image_processor ([`AutoImageProcessor`], *optional*): The image processor is a required input. video_processor ([`VideoLlavaVideoProcessor`], *optional*): The video processor is a required input. @@ -61,7 +61,7 @@ class VideoLlavaProcessor(ProcessorMixin): """ attributes = ["image_processor", "video_processor", "tokenizer"] - image_processor_class = "VideoLlavaImageProcessor" + image_processor_class = "AutoImageProcessor" video_processor_class = "AutoVideoProcessor" tokenizer_class = "AutoTokenizer" diff --git a/src/transformers/models/vilt/processing_vilt.py b/src/transformers/models/vilt/processing_vilt.py index 5b5126ad4a85..209ab1f362ed 100644 --- a/src/transformers/models/vilt/processing_vilt.py +++ b/src/transformers/models/vilt/processing_vilt.py @@ -38,20 +38,20 @@ class ViltProcessorKwargs(ProcessingKwargs, total=False): class ViltProcessor(ProcessorMixin): r""" - Constructs a ViLT processor which wraps a BERT tokenizer and ViLT image processor into a single processor. + Constructs a ViLT processor which wraps a BERT tokenizer and AutoImageProcessor into a single processor. [`ViltProcessor`] offers all the functionalities of [`ViltImageProcessor`] and [`BertTokenizerFast`]. See the docstring of [`~ViltProcessor.__call__`] and [`~ViltProcessor.decode`] for more information. Args: - image_processor (`ViltImageProcessor`, *optional*): - An instance of [`ViltImageProcessor`]. The image processor is a required input. + image_processor (`AutoImageProcessor`, *optional*): + An instance of [`AutoImageProcessor`]. The image processor is a required input. tokenizer (`BertTokenizerFast`, *optional*): An instance of ['BertTokenizerFast`]. The tokenizer is a required input. 
""" attributes = ["image_processor", "tokenizer"] - image_processor_class = "ViltImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = ("BertTokenizer", "BertTokenizerFast") valid_processor_kwargs = ViltProcessorKwargs diff --git a/src/transformers/models/x_clip/processing_x_clip.py b/src/transformers/models/x_clip/processing_x_clip.py index 581dabc6d8b5..d2878c3e2018 100644 --- a/src/transformers/models/x_clip/processing_x_clip.py +++ b/src/transformers/models/x_clip/processing_x_clip.py @@ -23,20 +23,20 @@ class XCLIPProcessor(ProcessorMixin): r""" - Constructs an X-CLIP processor which wraps a VideoMAE image processor and a CLIP tokenizer into a single processor. + Constructs an X-CLIP processor which wraps a AutoImageProcessor and a CLIP tokenizer into a single processor. - [`XCLIPProcessor`] offers all the functionalities of [`VideoMAEImageProcessor`] and [`CLIPTokenizerFast`]. See the + [`XCLIPProcessor`] offers all the functionalities of [`AutoImageProcessor`] and [`CLIPTokenizerFast`]. See the [`~XCLIPProcessor.__call__`] and [`~XCLIPProcessor.decode`] for more information. Args: - image_processor ([`VideoMAEImageProcessor`], *optional*): + image_processor ([`AutoImageProcessor`], *optional*): The image processor is a required input. tokenizer ([`CLIPTokenizerFast`], *optional*): The tokenizer is a required input. """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "VideoMAEImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast") def __init__(self, image_processor=None, tokenizer=None, **kwargs): From c35a1c803e5e567b228a13c0008e132169540739 Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Mon, 6 Oct 2025 19:09:51 +0000 Subject: [PATCH 3/4] fix-copies --- src/transformers/models/owlv2/processing_owlv2.py | 3 +-- src/transformers/models/sam2_video/processing_sam2_video.py | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/owlv2/processing_owlv2.py b/src/transformers/models/owlv2/processing_owlv2.py index 65f111e2ca79..8be195c78e10 100644 --- a/src/transformers/models/owlv2/processing_owlv2.py +++ b/src/transformers/models/owlv2/processing_owlv2.py @@ -84,7 +84,7 @@ def __call__( Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode: the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to - CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring + AutoImageProcessor's [`~AutoImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring of the above two methods for more information. 
Args: @@ -151,7 +151,6 @@ def __call__( if return_tensors == "np": input_ids = np.concatenate([encoding["input_ids"] for encoding in encodings], axis=0) attention_mask = np.concatenate([encoding["attention_mask"] for encoding in encodings], axis=0) - elif return_tensors == "pt" and is_torch_available(): import torch diff --git a/src/transformers/models/sam2_video/processing_sam2_video.py b/src/transformers/models/sam2_video/processing_sam2_video.py index 5fd79a234183..e3e244d8570a 100644 --- a/src/transformers/models/sam2_video/processing_sam2_video.py +++ b/src/transformers/models/sam2_video/processing_sam2_video.py @@ -40,11 +40,11 @@ class Sam2VideoProcessor(ProcessorMixin): single processor. [`Sam2VideoProcessor`] offers all the functionalities of [`AutoImageProcessor`] and [`Sam2VideoProcessor`]. See the docstring of - [`~Sam2ImageProcessorFast.__call__`] and [`~Sam2VideoProcessor.__call__`] for more information. + [`~AutoImageProcessor.__call__`] and [`~Sam2VideoProcessor.__call__`] for more information. Args: image_processor (`AutoImageProcessor`): - An instance of [`Sam2ImageProcessorFast`]. + An instance of [`AutoImageProcessor`]. video_processor (`Sam2VideoVideoProcessor`): An instance of [`Sam2VideoVideoProcessor`]. target_size (`int`, *optional*): From 94696184d43029a30769e37215a873824aedd51f Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Wed, 15 Oct 2025 10:07:16 +0000 Subject: [PATCH 4/4] Fix more tests --- .../models/auto/tokenization_auto.py | 1 + .../models/blip/processing_blip.py | 2 +- .../models/blip_2/processing_blip_2.py | 2 +- .../chinese_clip/processing_chinese_clip.py | 4 +- .../models/clip/processing_clip.py | 2 +- .../models/clipseg/processing_clipseg.py | 4 +- .../instructblip/processing_instructblip.py | 2 +- .../models/kosmos2/processing_kosmos2.py | 2 +- src/transformers/processing_utils.py | 125 ++++++++++++------ tests/models/align/test_processing_align.py | 22 +-- tests/models/blip/test_processing_blip.py | 23 +++- tests/models/blip_2/test_processing_blip_2.py | 18 ++- .../test_processing_chinese_clip.py | 27 ++-- tests/models/clip/test_processing_clip.py | 27 ++-- .../models/clipseg/test_processing_clipseg.py | 28 ++-- tests/models/flava/test_processing_flava.py | 30 +++-- tests/models/git/test_processing_git.py | 19 ++- ...ssor_glm4v.py => test_processing_glm4v.py} | 0 .../test_processing_grounding_dino.py | 24 +++- .../test_processing_instructblip.py | 17 ++- .../models/kosmos2/test_processing_kosmos2.py | 44 +++--- ...mos2_5.py => test_processing_kosmos2_5.py} | 20 +-- ...ssor_ovis2.py => test_processing_ovis2.py} | 0 ...cessor_sam2.py => test_processing_sam2.py} | 0 ...video.py => test_processing_sam2_video.py} | 0 tests/test_processing_common.py | 32 ++++- 26 files changed, 313 insertions(+), 162 deletions(-) rename tests/models/glm4v/{test_processor_glm4v.py => test_processing_glm4v.py} (100%) rename tests/models/kosmos2_5/{test_processor_kosmos2_5.py => test_processing_kosmos2_5.py} (96%) rename tests/models/ovis2/{test_processor_ovis2.py => test_processing_ovis2.py} (100%) rename tests/models/sam2/{test_processor_sam2.py => test_processing_sam2.py} (100%) rename tests/models/sam2_video/{test_processor_sam2_video.py => test_processing_sam2_video.py} (100%) diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index ccee9937afa6..35dd6960ed37 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -253,6 +253,7 @@ 
("FastSpeech2ConformerTokenizer" if is_g2p_en_available() else None, None), ), ("flaubert", ("FlaubertTokenizer", None)), + ("flava", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("flex_olmo", (None, "GPT2TokenizerFast" if is_tokenizers_available() else None)), ("fnet", ("FNetTokenizer", "FNetTokenizerFast" if is_tokenizers_available() else None)), ("fsmt", ("FSMTTokenizer", None)), diff --git a/src/transformers/models/blip/processing_blip.py b/src/transformers/models/blip/processing_blip.py index f600e8ce27d8..a54436bcafd6 100644 --- a/src/transformers/models/blip/processing_blip.py +++ b/src/transformers/models/blip/processing_blip.py @@ -54,7 +54,7 @@ class BlipProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - image_processor_class = ("BlipImageProcessor", "BlipImageProcessorFast") + image_processor_class = "AutoImageProcessor" tokenizer_class = ("BertTokenizer", "BertTokenizerFast") def __init__(self, image_processor, tokenizer, **kwargs): diff --git a/src/transformers/models/blip_2/processing_blip_2.py b/src/transformers/models/blip_2/processing_blip_2.py index 40729f4f4501..4382cc3cfaa0 100644 --- a/src/transformers/models/blip_2/processing_blip_2.py +++ b/src/transformers/models/blip_2/processing_blip_2.py @@ -61,7 +61,7 @@ class Blip2Processor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - image_processor_class = ("BlipImageProcessor", "BlipImageProcessorFast") + image_processor_class = "AutoImageProcessor" tokenizer_class = "AutoTokenizer" def __init__(self, image_processor, tokenizer, num_query_tokens=None, **kwargs): diff --git a/src/transformers/models/chinese_clip/processing_chinese_clip.py b/src/transformers/models/chinese_clip/processing_chinese_clip.py index 0510b9b0f3c9..7c6bd572deeb 100644 --- a/src/transformers/models/chinese_clip/processing_chinese_clip.py +++ b/src/transformers/models/chinese_clip/processing_chinese_clip.py @@ -35,8 +35,8 @@ class ChineseCLIPProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - image_processor_class = ("ChineseCLIPImageProcessor", "ChineseCLIPImageProcessorFast") - tokenizer_class = ("BertTokenizer", "BertTokenizerFast") + image_processor_class = "AutoImageProcessor" + tokenizer_class = "AutoTokenizer" def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/clip/processing_clip.py b/src/transformers/models/clip/processing_clip.py index 7b856f9981ee..fe56058996e4 100644 --- a/src/transformers/models/clip/processing_clip.py +++ b/src/transformers/models/clip/processing_clip.py @@ -34,7 +34,7 @@ class CLIPProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - image_processor_class = ("CLIPImageProcessor", "CLIPImageProcessorFast") + image_processor_class = "AutoImageProcessor" tokenizer_class = "AutoTokenizer" def __init__(self, image_processor=None, tokenizer=None, **kwargs): diff --git a/src/transformers/models/clipseg/processing_clipseg.py b/src/transformers/models/clipseg/processing_clipseg.py index 39e091106c71..4a8ee28f65f5 100644 --- a/src/transformers/models/clipseg/processing_clipseg.py +++ b/src/transformers/models/clipseg/processing_clipseg.py @@ -35,8 +35,8 @@ class CLIPSegProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - image_processor_class = ("ViTImageProcessor", "ViTImageProcessorFast") - tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast") + image_processor_class 
= "AutoImageProcessor" + tokenizer_class = "AutoTokenizer" def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/instructblip/processing_instructblip.py b/src/transformers/models/instructblip/processing_instructblip.py index afe43c1fc7a7..8b749aa21d99 100644 --- a/src/transformers/models/instructblip/processing_instructblip.py +++ b/src/transformers/models/instructblip/processing_instructblip.py @@ -66,7 +66,7 @@ class InstructBlipProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer", "qformer_tokenizer"] - image_processor_class = ("BlipImageProcessor", "BlipImageProcessorFast") + image_processor_class = "AutoImageProcessor" tokenizer_class = "AutoTokenizer" qformer_tokenizer_class = "AutoTokenizer" diff --git a/src/transformers/models/kosmos2/processing_kosmos2.py b/src/transformers/models/kosmos2/processing_kosmos2.py index f9fb98df6ac2..5db8d32b9fe2 100644 --- a/src/transformers/models/kosmos2/processing_kosmos2.py +++ b/src/transformers/models/kosmos2/processing_kosmos2.py @@ -86,7 +86,7 @@ class Kosmos2Processor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - image_processor_class = ("CLIPImageProcessor", "CLIPImageProcessorFast") + image_processor_class = "AutoImageProcessor" tokenizer_class = "AutoTokenizer" def __init__(self, image_processor, tokenizer, num_patch_index_tokens=1024, *kwargs): diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 55844c8d9cce..e96fe5c4767e 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -36,24 +36,6 @@ from .dynamic_module_utils import custom_object_save from .feature_extraction_utils import BatchFeature from .image_utils import ChannelDimension, ImageInput, is_vision_available -from .utils.chat_template_utils import render_jinja_template -from .utils.type_validators import ( - device_validator, - image_size_validator, - padding_validator, - positive_any_number, - positive_int, - resampling_validator, - tensor_type_validator, - truncation_validator, - video_metadata_validator, -) -from .video_utils import VideoInput, VideoMetadataType - - -if is_vision_available(): - from .image_utils import PILImageResampling - from .tokenization_utils_base import ( PaddingStrategy, PreTokenizedInput, @@ -79,12 +61,27 @@ list_repo_templates, logging, ) +from .utils.chat_template_utils import render_jinja_template from .utils.deprecation import deprecate_kwarg +from .utils.type_validators import ( + device_validator, + image_size_validator, + padding_validator, + positive_any_number, + positive_int, + resampling_validator, + tensor_type_validator, + truncation_validator, + video_metadata_validator, +) +from .video_utils import VideoInput, VideoMetadataType if is_torch_available(): from .modeling_utils import PreTrainedAudioTokenizerBase +if is_vision_available(): + from .image_utils import PILImageResampling logger = logging.get_logger(__name__) @@ -95,6 +92,38 @@ transformers_module = direct_transformers_import(Path(__file__).parent) +class _LazyAutoProcessorMapping(dict): + """ + Lazy dictionary to avoid circular imports. + The mapping names are only imported when accessed. 
+ """ + + _MAPPING_NAMES = { + "image_processor": ("transformers.models.auto.image_processing_auto", "IMAGE_PROCESSOR_MAPPING_NAMES"), + "video_processor": ("transformers.models.auto.video_processing_auto", "VIDEO_PROCESSOR_MAPPING_NAMES"), + "feature_extractor": ( + "transformers.models.auto.feature_extraction_auto", + "FEATURE_EXTRACTOR_MAPPING_NAMES", + ), + "tokenizer": ("transformers.models.auto.tokenization_auto", "TOKENIZER_MAPPING_NAMES"), + } + + def __getitem__(self, key): + if key not in self._MAPPING_NAMES: + raise KeyError(key) + module_name, attr_name = self._MAPPING_NAMES[key] + module = __import__(module_name, fromlist=[attr_name]) + return getattr(module, attr_name) + + def __contains__(self, key): + return key in self._MAPPING_NAMES + + def keys(self): + return self._MAPPING_NAMES.keys() + + +MODALITY_TO_AUTOPROCESSOR_MAPPING = _LazyAutoProcessorMapping() + AUTO_TO_BASE_CLASS_MAPPING = { "AutoTokenizer": "PreTrainedTokenizerBase", "AutoFeatureExtractor": "FeatureExtractionMixin", @@ -102,6 +131,11 @@ "AutoVideoProcessor": "BaseVideoProcessor", } +SPECIAL_MODULE_TO_MODEL_NAME_MAPPING = { + "kosmos2_5": "kosmos-2.5", + "kosmos2": "kosmos-2", +} + if sys.version_info >= (3, 11): Unpack = typing.Unpack else: @@ -1497,30 +1531,45 @@ def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs) via methods like `AutoTokenizer.register()`. If neither of these conditions are fulfilled, this method will be unable to find the relevant subcomponent class and will raise an error. """ + # Lazy import to avoid circular imports + args = [] - for attribute_name in cls.attributes: - class_name = getattr(cls, f"{attribute_name}_class") - if isinstance(class_name, tuple): - classes = tuple(cls.get_possibly_dynamic_module(n) if n is not None else None for n in class_name) - if attribute_name == "image_processor": - # TODO: @yoni, change logic in v4.52 (when use_fast set to True by default) - use_fast = kwargs.get("use_fast") - if use_fast is None: - logger.warning_once( - "Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. " - "`use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. " - "This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`." 
- ) - else: + # get args from processor init signature + model_name_lowercase = cls.__module__.split(".")[-1].replace("processing_", "").split(".")[0] + sub_processors = inspect.signature(cls.__init__).parameters.keys() + for sub_processor_type in sub_processors: + if sub_processor_type not in MODALITY_TO_AUTOPROCESSOR_MAPPING and "tokenizer" in sub_processor_type: + sub_processor_type = "tokenizer" + if sub_processor_type in MODALITY_TO_AUTOPROCESSOR_MAPPING: + sub_processor_names = MODALITY_TO_AUTOPROCESSOR_MAPPING[sub_processor_type].get( + model_name_lowercase, None + ) + if sub_processor_names is None: + sub_processor_names = MODALITY_TO_AUTOPROCESSOR_MAPPING[sub_processor_type].get( + model_name_lowercase.replace("_", "-"), None + ) + if sub_processor_names is None: + sub_processor_names = MODALITY_TO_AUTOPROCESSOR_MAPPING[sub_processor_type].get( + SPECIAL_MODULE_TO_MODEL_NAME_MAPPING.get(model_name_lowercase, None), None + ) + if sub_processor_names is None: + raise ValueError( + f"Could not find component class name for {sub_processor_type} and {model_name_lowercase}" + ) + if isinstance(sub_processor_names, tuple): use_fast = kwargs.get("use_fast", True) - if use_fast and classes[1] is not None: - attribute_class = classes[1] + if use_fast and sub_processor_names[1] is not None: + sub_processor_name = sub_processor_names[1] + else: + sub_processor_name = sub_processor_names[0] else: - attribute_class = classes[0] - else: - attribute_class = cls.get_possibly_dynamic_module(class_name) + sub_processor_name = sub_processor_names - args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs)) + if hasattr(transformers_module, sub_processor_name): + sub_processor_class = getattr(transformers_module, sub_processor_name) + args.append(sub_processor_class.from_pretrained(pretrained_model_name_or_path, **kwargs)) + else: + raise ValueError(f"Could not find module {sub_processor_name} in `transformers`.") return args diff --git a/tests/models/align/test_processing_align.py b/tests/models/align/test_processing_align.py index 0adfc5a82205..8b2979456d7c 100644 --- a/tests/models/align/test_processing_align.py +++ b/tests/models/align/test_processing_align.py @@ -23,13 +23,15 @@ from transformers import BertTokenizer, BertTokenizerFast from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES from transformers.testing_utils import require_vision -from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available +from transformers.utils import IMAGE_PROCESSOR_NAME, is_torchvision_available, is_vision_available from ...test_processing_common import ProcessorTesterMixin if is_vision_available(): from transformers import AlignProcessor, EfficientNetImageProcessor +if is_torchvision_available(): + from transformers import EfficientNetImageProcessorFast @require_vision @@ -80,6 +82,9 @@ def get_rust_tokenizer(self, **kwargs): def get_image_processor(self, **kwargs): return EfficientNetImageProcessor.from_pretrained(self.tmpdirname, **kwargs) + def get_image_processor_fast(self, **kwargs): + return EfficientNetImageProcessorFast.from_pretrained(self.tmpdirname, **kwargs) + def tearDown(self): shutil.rmtree(self.tmpdirname) @@ -87,12 +92,13 @@ def test_save_load_pretrained_default(self): tokenizer_slow = self.get_tokenizer() tokenizer_fast = self.get_rust_tokenizer() image_processor = self.get_image_processor() + image_processor_fast = self.get_image_processor_fast() processor_slow = AlignProcessor(tokenizer=tokenizer_slow, image_processor=image_processor) 
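To make the new lookup in `_get_arguments_from_pretrained` concrete, here is a simplified walk-through of the same resolution (a sketch only, not the actual helper; the module name and modality are examples): the processor's module name yields the model type, which is looked up in the auto mapping, and the fast class name is preferred when one is registered.

from transformers.models.auto.image_processing_auto import IMAGE_PROCESSOR_MAPPING_NAMES

module_name = "transformers.models.clip.processing_clip"
model_type = module_name.split(".")[-1].replace("processing_", "")  # -> "clip"

names = IMAGE_PROCESSOR_MAPPING_NAMES[model_type]
if isinstance(names, tuple):
    class_name = names[1] or names[0]  # prefer the fast variant when it exists
else:
    class_name = names
print(class_name)  # "CLIPImageProcessorFast"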
processor_slow.save_pretrained(self.tmpdirname) processor_slow = AlignProcessor.from_pretrained(self.tmpdirname, use_fast=False) - processor_fast = AlignProcessor(tokenizer=tokenizer_fast, image_processor=image_processor) + processor_fast = AlignProcessor(tokenizer=tokenizer_fast, image_processor=image_processor_fast) processor_fast.save_pretrained(self.tmpdirname) processor_fast = AlignProcessor.from_pretrained(self.tmpdirname) @@ -103,16 +109,16 @@ def test_save_load_pretrained_default(self): self.assertIsInstance(processor_fast.tokenizer, BertTokenizerFast) self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string()) + self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor_fast.to_json_string()) self.assertIsInstance(processor_slow.image_processor, EfficientNetImageProcessor) - self.assertIsInstance(processor_fast.image_processor, EfficientNetImageProcessor) + self.assertIsInstance(processor_fast.image_processor, EfficientNetImageProcessorFast) def test_save_load_pretrained_additional_features(self): processor = AlignProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) processor.save_pretrained(self.tmpdirname) tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) + image_processor_add_kwargs = self.get_image_processor_fast(do_normalize=False, padding_value=1.0) processor = AlignProcessor.from_pretrained( self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 @@ -122,7 +128,7 @@ def test_save_load_pretrained_additional_features(self): self.assertIsInstance(processor.tokenizer, BertTokenizerFast) self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, EfficientNetImageProcessor) + self.assertIsInstance(processor.image_processor, EfficientNetImageProcessorFast) def test_image_processor(self): image_processor = self.get_image_processor() @@ -132,8 +138,8 @@ def test_image_processor(self): image_input = self.prepare_image_inputs() - input_image_proc = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") + input_image_proc = image_processor(image_input, return_tensors="pt") + input_processor = processor(images=image_input, return_tensors="pt") for key in input_image_proc: self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2) diff --git a/tests/models/blip/test_processing_blip.py b/tests/models/blip/test_processing_blip.py index d9f045332ed3..a807b210e52d 100644 --- a/tests/models/blip/test_processing_blip.py +++ b/tests/models/blip/test_processing_blip.py @@ -17,17 +17,26 @@ import pytest -from transformers.testing_utils import require_torch, require_vision -from transformers.utils import is_vision_available +from transformers.testing_utils import require_torch, require_torchvision, require_vision +from transformers.utils import is_torchvision_available, is_vision_available from ...test_processing_common import ProcessorTesterMixin if is_vision_available(): - from transformers import AutoProcessor, BertTokenizer, BlipImageProcessor, BlipProcessor, PreTrainedTokenizerFast + from transformers import ( + AutoProcessor, + BertTokenizer, + 
BlipProcessor, + PreTrainedTokenizerFast, + ) + +if is_torchvision_available(): + from transformers import BlipImageProcessorFast @require_vision +@require_torchvision class BlipProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = BlipProcessor @@ -35,7 +44,7 @@ class BlipProcessorTest(ProcessorTesterMixin, unittest.TestCase): def setUpClass(cls): cls.tmpdirname = tempfile.mkdtemp() - image_processor = BlipImageProcessor() + image_processor = BlipImageProcessorFast() tokenizer = BertTokenizer.from_pretrained("hf-internal-testing/tiny-random-BertModel") processor = BlipProcessor(image_processor, tokenizer) @@ -68,7 +77,7 @@ def test_save_load_pretrained_additional_features(self): self.assertIsInstance(processor.tokenizer, PreTrainedTokenizerFast) self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, BlipImageProcessor) + self.assertIsInstance(processor.image_processor, BlipImageProcessorFast) def test_image_processor(self): image_processor = self.get_image_processor() @@ -78,8 +87,8 @@ def test_image_processor(self): image_input = self.prepare_image_inputs() - input_feat_extract = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") + input_feat_extract = image_processor(image_input, return_tensors="pt") + input_processor = processor(images=image_input, return_tensors="pt") for key in input_feat_extract: self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) diff --git a/tests/models/blip_2/test_processing_blip_2.py b/tests/models/blip_2/test_processing_blip_2.py index e5c17a11ce02..1a4d7f3894aa 100644 --- a/tests/models/blip_2/test_processing_blip_2.py +++ b/tests/models/blip_2/test_processing_blip_2.py @@ -17,17 +17,21 @@ import pytest -from transformers.testing_utils import require_vision -from transformers.utils import is_vision_available +from transformers.testing_utils import require_torchvision, require_vision +from transformers.utils import is_torchvision_available, is_vision_available from ...test_processing_common import ProcessorTesterMixin if is_vision_available(): - from transformers import AutoProcessor, Blip2Processor, BlipImageProcessor, GPT2Tokenizer, PreTrainedTokenizerFast + from transformers import AutoProcessor, Blip2Processor, GPT2Tokenizer, PreTrainedTokenizerFast + +if is_torchvision_available(): + from transformers import BlipImageProcessorFast @require_vision +@require_torchvision class Blip2ProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = Blip2Processor @@ -35,7 +39,7 @@ class Blip2ProcessorTest(ProcessorTesterMixin, unittest.TestCase): def setUpClass(cls): cls.tmpdirname = tempfile.mkdtemp() - image_processor = BlipImageProcessor() + image_processor = BlipImageProcessorFast() tokenizer = GPT2Tokenizer.from_pretrained("hf-internal-testing/tiny-random-GPT2Model") processor = Blip2Processor(image_processor, tokenizer) @@ -71,7 +75,7 @@ def test_save_load_pretrained_additional_features(self): self.assertIsInstance(processor.tokenizer, PreTrainedTokenizerFast) self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, BlipImageProcessor) + self.assertIsInstance(processor.image_processor, BlipImageProcessorFast) def test_image_processor(self): image_processor = self.get_image_processor() @@ -81,8 +85,8 @@ def 
test_image_processor(self): image_input = self.prepare_image_inputs() - input_feat_extract = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") + input_feat_extract = image_processor(image_input, return_tensors="pt") + input_processor = processor(images=image_input, return_tensors="pt") for key in input_feat_extract: self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) diff --git a/tests/models/chinese_clip/test_processing_chinese_clip.py b/tests/models/chinese_clip/test_processing_chinese_clip.py index 5aef3d06c15b..9407b503ce79 100644 --- a/tests/models/chinese_clip/test_processing_chinese_clip.py +++ b/tests/models/chinese_clip/test_processing_chinese_clip.py @@ -22,8 +22,8 @@ from transformers import BertTokenizer, BertTokenizerFast from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES -from transformers.testing_utils import require_vision -from transformers.utils import FEATURE_EXTRACTOR_NAME, is_vision_available +from transformers.testing_utils import require_torchvision, require_vision +from transformers.utils import FEATURE_EXTRACTOR_NAME, is_torchvision_available, is_vision_available from ...test_processing_common import ProcessorTesterMixin @@ -31,8 +31,12 @@ if is_vision_available(): from transformers import ChineseCLIPImageProcessor, ChineseCLIPProcessor +if is_torchvision_available(): + from transformers import ChineseCLIPImageProcessorFast + @require_vision +@require_torchvision class ChineseCLIPProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = ChineseCLIPProcessor @@ -95,6 +99,10 @@ def get_rust_tokenizer(cls, **kwargs): def get_image_processor(cls, **kwargs): return ChineseCLIPImageProcessor.from_pretrained(cls.tmpdirname, **kwargs) + @classmethod + def get_image_processor_fast(cls, **kwargs): + return ChineseCLIPImageProcessorFast.from_pretrained(cls.tmpdirname, **kwargs) + @classmethod def tearDownClass(cls): shutil.rmtree(cls.tmpdirname, ignore_errors=True) @@ -103,13 +111,14 @@ def test_save_load_pretrained_default(self): tokenizer_slow = self.get_tokenizer() tokenizer_fast = self.get_rust_tokenizer() image_processor = self.get_image_processor() + image_processor_fast = self.get_image_processor_fast() with tempfile.TemporaryDirectory() as tmpdir: processor_slow = ChineseCLIPProcessor(tokenizer=tokenizer_slow, image_processor=image_processor) processor_slow.save_pretrained(tmpdir) processor_slow = ChineseCLIPProcessor.from_pretrained(self.tmpdirname, use_fast=False) - processor_fast = ChineseCLIPProcessor(tokenizer=tokenizer_fast, image_processor=image_processor) + processor_fast = ChineseCLIPProcessor(tokenizer=tokenizer_fast, image_processor=image_processor_fast) processor_fast.save_pretrained(tmpdir) processor_fast = ChineseCLIPProcessor.from_pretrained(self.tmpdirname) @@ -120,9 +129,9 @@ def test_save_load_pretrained_default(self): self.assertIsInstance(processor_fast.tokenizer, BertTokenizerFast) self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string()) + self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor_fast.to_json_string()) self.assertIsInstance(processor_slow.image_processor, ChineseCLIPImageProcessor) - self.assertIsInstance(processor_fast.image_processor, ChineseCLIPImageProcessor) + 
self.assertIsInstance(processor_fast.image_processor, ChineseCLIPImageProcessorFast) def test_save_load_pretrained_additional_features(self): with tempfile.TemporaryDirectory() as tmpdir: @@ -132,7 +141,7 @@ def test_save_load_pretrained_additional_features(self): processor.save_pretrained(tmpdir) tokenizer_add_kwargs = self.get_tokenizer(cls_token="(CLS)", sep_token="(SEP)") - image_processor_add_kwargs = self.get_image_processor(do_normalize=False) + image_processor_add_kwargs = self.get_image_processor_fast(do_normalize=False) processor = ChineseCLIPProcessor.from_pretrained( tmpdir, cls_token="(CLS)", sep_token="(SEP)", do_normalize=False @@ -142,7 +151,7 @@ def test_save_load_pretrained_additional_features(self): self.assertIsInstance(processor.tokenizer, BertTokenizerFast) self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, ChineseCLIPImageProcessor) + self.assertIsInstance(processor.image_processor, ChineseCLIPImageProcessorFast) def test_image_processor(self): image_processor = self.get_image_processor() @@ -152,8 +161,8 @@ def test_image_processor(self): image_input = self.prepare_image_inputs() - input_feat_extract = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") + input_feat_extract = image_processor(image_input, return_tensors="pt") + input_processor = processor(images=image_input, return_tensors="pt") for key in input_feat_extract: self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) diff --git a/tests/models/clip/test_processing_clip.py b/tests/models/clip/test_processing_clip.py index 6ca9a47b29c7..a9d38bf6703f 100644 --- a/tests/models/clip/test_processing_clip.py +++ b/tests/models/clip/test_processing_clip.py @@ -18,9 +18,9 @@ import pytest -from transformers import AutoTokenizer, CLIPTokenizer, CLIPTokenizerFast -from transformers.testing_utils import require_vision -from transformers.utils import is_vision_available +from transformers import AutoImageProcessor, AutoTokenizer, CLIPTokenizer, CLIPTokenizerFast +from transformers.testing_utils import require_torchvision, require_vision +from transformers.utils import is_torchvision_available, is_vision_available from ...test_processing_common import ProcessorTesterMixin @@ -28,11 +28,15 @@ if is_vision_available(): from transformers import CLIPImageProcessor, CLIPProcessor +if is_torchvision_available(): + from transformers import CLIPImageProcessorFast + TEST_MODEL_PATH = "openai/clip-vit-base-patch32" @require_vision +@require_torchvision class CLIPProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = CLIPProcessor @@ -40,7 +44,7 @@ class CLIPProcessorTest(ProcessorTesterMixin, unittest.TestCase): def setUpClass(cls): cls.tmpdirname = tempfile.mkdtemp() tokenizer = AutoTokenizer.from_pretrained(TEST_MODEL_PATH) - image_processor = CLIPImageProcessor.from_pretrained(TEST_MODEL_PATH) + image_processor = AutoImageProcessor.from_pretrained(TEST_MODEL_PATH) processor = CLIPProcessor( image_processor=image_processor, tokenizer=tokenizer, @@ -59,6 +63,10 @@ def get_rust_tokenizer(cls, **kwargs): def get_image_processor(cls, **kwargs): return CLIPImageProcessor.from_pretrained(cls.tmpdirname, **kwargs) + @classmethod + def get_image_processor_fast(cls, **kwargs): + return CLIPImageProcessorFast.from_pretrained(cls.tmpdirname, **kwargs) + @classmethod def tearDownClass(cls): 
shutil.rmtree(cls.tmpdirname) @@ -67,6 +75,7 @@ def test_save_load_pretrained_default(self): tokenizer_slow = self.get_tokenizer() tokenizer_fast = self.get_rust_tokenizer() image_processor = self.get_image_processor() + image_processor_fast = self.get_image_processor_fast() with tempfile.TemporaryDirectory() as tmpdir: processor_slow = CLIPProcessor(tokenizer=tokenizer_slow, image_processor=image_processor) @@ -84,17 +93,17 @@ def test_save_load_pretrained_default(self): self.assertIsInstance(processor_fast.tokenizer, CLIPTokenizerFast) self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string()) + self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor_fast.to_json_string()) self.assertIsInstance(processor_slow.image_processor, CLIPImageProcessor) - self.assertIsInstance(processor_fast.image_processor, CLIPImageProcessor) + self.assertIsInstance(processor_fast.image_processor, CLIPImageProcessorFast) def test_save_load_pretrained_additional_features(self): with tempfile.TemporaryDirectory() as tmpdir: - processor = CLIPProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) + processor = CLIPProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor_fast()) processor.save_pretrained(tmpdir) tokenizer_add_kwargs = CLIPTokenizer.from_pretrained(tmpdir, bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = CLIPImageProcessor.from_pretrained( + image_processor_add_kwargs = CLIPImageProcessorFast.from_pretrained( tmpdir, do_normalize=False, padding_value=1.0 ) @@ -106,7 +115,7 @@ def test_save_load_pretrained_additional_features(self): self.assertIsInstance(processor.tokenizer, CLIPTokenizerFast) self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, CLIPImageProcessor) + self.assertIsInstance(processor.image_processor, CLIPImageProcessorFast) def test_image_processor(self): image_processor = self.get_image_processor() diff --git a/tests/models/clipseg/test_processing_clipseg.py b/tests/models/clipseg/test_processing_clipseg.py index f7255838caa8..2f6a492cd408 100644 --- a/tests/models/clipseg/test_processing_clipseg.py +++ b/tests/models/clipseg/test_processing_clipseg.py @@ -22,8 +22,8 @@ from transformers import CLIPTokenizer, CLIPTokenizerFast from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES -from transformers.testing_utils import require_vision -from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available +from transformers.testing_utils import require_torchvision, require_vision +from transformers.utils import IMAGE_PROCESSOR_NAME, is_torchvision_available, is_vision_available from ...test_processing_common import ProcessorTesterMixin @@ -31,8 +31,12 @@ if is_vision_available(): from transformers import CLIPSegProcessor, ViTImageProcessor +if is_torchvision_available(): + from transformers import ViTImageProcessorFast + @require_vision +@require_torchvision class CLIPSegProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = CLIPSegProcessor @@ -73,6 +77,9 @@ def get_rust_tokenizer(self, **kwargs): def get_image_processor(self, **kwargs): return ViTImageProcessor.from_pretrained(self.tmpdirname, **kwargs) + def get_image_processor_fast(self, **kwargs): + return 
ViTImageProcessorFast.from_pretrained(self.tmpdirname, **kwargs) + def tearDown(self): shutil.rmtree(self.tmpdirname) @@ -80,12 +87,13 @@ def test_save_load_pretrained_default(self): tokenizer_slow = self.get_tokenizer() tokenizer_fast = self.get_rust_tokenizer() image_processor = self.get_image_processor() + image_processor_fast = self.get_image_processor_fast() processor_slow = CLIPSegProcessor(tokenizer=tokenizer_slow, image_processor=image_processor) processor_slow.save_pretrained(self.tmpdirname) processor_slow = CLIPSegProcessor.from_pretrained(self.tmpdirname, use_fast=False) - processor_fast = CLIPSegProcessor(tokenizer=tokenizer_fast, image_processor=image_processor) + processor_fast = CLIPSegProcessor(tokenizer=tokenizer_fast, image_processor=image_processor_fast) processor_fast.save_pretrained(self.tmpdirname) processor_fast = CLIPSegProcessor.from_pretrained(self.tmpdirname) @@ -96,16 +104,16 @@ def test_save_load_pretrained_default(self): self.assertIsInstance(processor_fast.tokenizer, CLIPTokenizerFast) self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string()) + self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor_fast.to_json_string()) self.assertIsInstance(processor_slow.image_processor, ViTImageProcessor) - self.assertIsInstance(processor_fast.image_processor, ViTImageProcessor) + self.assertIsInstance(processor_fast.image_processor, ViTImageProcessorFast) def test_save_load_pretrained_additional_features(self): - processor = CLIPSegProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) + processor = CLIPSegProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor_fast()) processor.save_pretrained(self.tmpdirname) tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) + image_processor_add_kwargs = self.get_image_processor_fast(do_normalize=False, padding_value=1.0) processor = CLIPSegProcessor.from_pretrained( self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 @@ -115,7 +123,7 @@ def test_save_load_pretrained_additional_features(self): self.assertIsInstance(processor.tokenizer, CLIPTokenizerFast) self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, ViTImageProcessor) + self.assertIsInstance(processor.image_processor, ViTImageProcessorFast) def test_image_processor(self): image_processor = self.get_image_processor() @@ -125,8 +133,8 @@ def test_image_processor(self): image_input = self.prepare_image_inputs() - input_feat_extract = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") + input_feat_extract = image_processor(image_input, return_tensors="pt") + input_processor = processor(images=image_input, return_tensors="pt") for key in input_feat_extract: self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) diff --git a/tests/models/flava/test_processing_flava.py b/tests/models/flava/test_processing_flava.py index 10a00a869915..7500ac13707e 100644 --- a/tests/models/flava/test_processing_flava.py +++ b/tests/models/flava/test_processing_flava.py @@ -23,8 +23,8 @@ from transformers 
import BertTokenizer, BertTokenizerFast from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES -from transformers.testing_utils import require_vision -from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available +from transformers.testing_utils import require_torchvision, require_vision +from transformers.utils import IMAGE_PROCESSOR_NAME, is_torchvision_available, is_vision_available from ...test_processing_common import ProcessorTesterMixin @@ -38,8 +38,12 @@ FLAVA_IMAGE_STD, ) +if is_torchvision_available(): + from transformers import FlavaImageProcessorFast + @require_vision +@require_torchvision class FlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = FlavaProcessor @@ -89,6 +93,9 @@ def get_rust_tokenizer(self, **kwargs): def get_image_processor(self, **kwargs): return FlavaImageProcessor.from_pretrained(self.tmpdirname, **kwargs) + def get_image_processor_fast(self, **kwargs): + return FlavaImageProcessorFast.from_pretrained(self.tmpdirname, **kwargs) + def tearDown(self): shutil.rmtree(self.tmpdirname) @@ -96,12 +103,13 @@ def test_save_load_pretrained_default(self): tokenizer_slow = self.get_tokenizer() tokenizer_fast = self.get_rust_tokenizer() image_processor = self.get_image_processor() + image_processor_fast = self.get_image_processor_fast() processor_slow = FlavaProcessor(tokenizer=tokenizer_slow, image_processor=image_processor) processor_slow.save_pretrained(self.tmpdirname) processor_slow = FlavaProcessor.from_pretrained(self.tmpdirname, use_fast=False) - processor_fast = FlavaProcessor(tokenizer=tokenizer_fast, image_processor=image_processor) + processor_fast = FlavaProcessor(tokenizer=tokenizer_fast, image_processor=image_processor_fast) processor_fast.save_pretrained(self.tmpdirname) processor_fast = FlavaProcessor.from_pretrained(self.tmpdirname) @@ -112,16 +120,16 @@ def test_save_load_pretrained_default(self): self.assertIsInstance(processor_fast.tokenizer, BertTokenizerFast) self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string()) + self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor_fast.to_json_string()) self.assertIsInstance(processor_slow.image_processor, FlavaImageProcessor) - self.assertIsInstance(processor_fast.image_processor, FlavaImageProcessor) + self.assertIsInstance(processor_fast.image_processor, FlavaImageProcessorFast) def test_save_load_pretrained_additional_features(self): processor = FlavaProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) processor.save_pretrained(self.tmpdirname) tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) + image_processor_add_kwargs = self.get_image_processor_fast(do_normalize=False, padding_value=1.0) processor = FlavaProcessor.from_pretrained( self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 @@ -131,7 +139,7 @@ def test_save_load_pretrained_additional_features(self): self.assertIsInstance(processor.tokenizer, BertTokenizerFast) self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, FlavaImageProcessor) + self.assertIsInstance(processor.image_processor, FlavaImageProcessorFast) def 
test_image_processor(self): image_processor = self.get_image_processor() @@ -141,8 +149,8 @@ def test_image_processor(self): image_input = self.prepare_image_inputs() - input_feat_extract = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") + input_feat_extract = image_processor(image_input, return_tensors="pt") + input_processor = processor(images=image_input, return_tensors="pt") for key in input_feat_extract: self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) @@ -150,11 +158,11 @@ def test_image_processor(self): # With rest of the args random.seed(1234) input_feat_extract = image_processor( - image_input, return_image_mask=True, return_codebook_pixels=True, return_tensors="np" + image_input, return_image_mask=True, return_codebook_pixels=True, return_tensors="pt" ) random.seed(1234) input_processor = processor( - images=image_input, return_image_mask=True, return_codebook_pixels=True, return_tensors="np" + images=image_input, return_image_mask=True, return_codebook_pixels=True, return_tensors="pt" ) for key in input_feat_extract: diff --git a/tests/models/git/test_processing_git.py b/tests/models/git/test_processing_git.py index 5e06636007bc..a1842ffaaa81 100644 --- a/tests/models/git/test_processing_git.py +++ b/tests/models/git/test_processing_git.py @@ -17,8 +17,8 @@ import pytest -from transformers.testing_utils import require_vision -from transformers.utils import is_vision_available +from transformers.testing_utils import require_torchvision, require_vision +from transformers.utils import is_torchvision_available, is_vision_available from ...test_processing_common import ProcessorTesterMixin @@ -26,8 +26,12 @@ if is_vision_available(): from transformers import AutoProcessor, BertTokenizer, CLIPImageProcessor, GitProcessor, PreTrainedTokenizerFast +if is_torchvision_available(): + from transformers import CLIPImageProcessorFast + @require_vision +@require_torchvision class GitProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = GitProcessor @@ -50,6 +54,9 @@ def get_tokenizer(self, **kwargs): def get_image_processor(self, **kwargs): return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor + def get_image_processor_fast(self, **kwargs): + return CLIPImageProcessorFast.from_pretrained(self.tmpdirname, **kwargs) + @classmethod def tearDownClass(cls): shutil.rmtree(cls.tmpdirname, ignore_errors=True) @@ -60,7 +67,7 @@ def test_save_load_pretrained_additional_features(self): processor.save_pretrained(tmpdir) tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) + image_processor_add_kwargs = self.get_image_processor_fast(do_normalize=False, padding_value=1.0) processor = GitProcessor.from_pretrained( tmpdir, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 @@ -70,7 +77,7 @@ def test_save_load_pretrained_additional_features(self): self.assertIsInstance(processor.tokenizer, PreTrainedTokenizerFast) self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, CLIPImageProcessor) + self.assertIsInstance(processor.image_processor, CLIPImageProcessorFast) def test_image_processor(self): image_processor = self.get_image_processor() @@ -80,8 +87,8 @@ def test_image_processor(self): image_input = 
self.prepare_image_inputs() - input_feat_extract = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") + input_feat_extract = image_processor(image_input, return_tensors="pt") + input_processor = processor(images=image_input, return_tensors="pt") for key in input_feat_extract: self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) diff --git a/tests/models/glm4v/test_processor_glm4v.py b/tests/models/glm4v/test_processing_glm4v.py similarity index 100% rename from tests/models/glm4v/test_processor_glm4v.py rename to tests/models/glm4v/test_processing_glm4v.py diff --git a/tests/models/grounding_dino/test_processing_grounding_dino.py b/tests/models/grounding_dino/test_processing_grounding_dino.py index 088f240eee73..46cac4c97893 100644 --- a/tests/models/grounding_dino/test_processing_grounding_dino.py +++ b/tests/models/grounding_dino/test_processing_grounding_dino.py @@ -23,8 +23,8 @@ from transformers import BertTokenizer, BertTokenizerFast, GroundingDinoProcessor from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES -from transformers.testing_utils import require_torch, require_vision -from transformers.utils import IMAGE_PROCESSOR_NAME, is_torch_available, is_vision_available +from transformers.testing_utils import require_torch, require_torchvision, require_vision +from transformers.utils import IMAGE_PROCESSOR_NAME, is_torch_available, is_torchvision_available, is_vision_available from ...test_processing_common import ProcessorTesterMixin @@ -38,8 +38,13 @@ from transformers import GroundingDinoImageProcessor +if is_torchvision_available(): + from transformers import GroundingDinoImageProcessorFast + + @require_torch @require_vision +@require_torchvision class GroundingDinoProcessorTest(ProcessorTesterMixin, unittest.TestCase): from_pretrained_id = "IDEA-Research/grounding-dino-base" processor_class = GroundingDinoProcessor @@ -108,6 +113,10 @@ def get_rust_tokenizer(cls, **kwargs): def get_image_processor(cls, **kwargs): return GroundingDinoImageProcessor.from_pretrained(cls.tmpdirname, **kwargs) + # Copied from tests.models.clip.test_processing_clip.CLIPProcessorTest.get_image_processor_fast with CLIP->GroundingDino + def get_image_processor_fast(cls, **kwargs): + return GroundingDinoImageProcessorFast.from_pretrained(cls.tmpdirname, **kwargs) + @classmethod def tearDownClass(cls): shutil.rmtree(cls.tmpdirname, ignore_errors=True) @@ -150,6 +159,7 @@ def test_save_load_pretrained_default(self): tokenizer_slow = self.get_tokenizer() tokenizer_fast = self.get_rust_tokenizer() image_processor = self.get_image_processor() + image_processor_fast = self.get_image_processor_fast() with tempfile.TemporaryDirectory() as tmpdir: processor_slow = GroundingDinoProcessor(tokenizer=tokenizer_slow, image_processor=image_processor) @@ -167,20 +177,20 @@ def test_save_load_pretrained_default(self): self.assertIsInstance(processor_fast.tokenizer, BertTokenizerFast) self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string()) + self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor_fast.to_json_string()) self.assertIsInstance(processor_slow.image_processor, GroundingDinoImageProcessor) - self.assertIsInstance(processor_fast.image_processor, GroundingDinoImageProcessor) + 
self.assertIsInstance(processor_fast.image_processor, GroundingDinoImageProcessorFast) # Copied from tests.models.clip.test_processing_clip.CLIPProcessorTest.test_save_load_pretrained_additional_features with CLIP->GroundingDino,GroundingDinoTokenizer->BertTokenizer def test_save_load_pretrained_additional_features(self): with tempfile.TemporaryDirectory() as tmpdir: processor = GroundingDinoProcessor( - tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor() + tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor_fast() ) processor.save_pretrained(tmpdir) tokenizer_add_kwargs = BertTokenizer.from_pretrained(tmpdir, bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = GroundingDinoImageProcessor.from_pretrained( + image_processor_add_kwargs = GroundingDinoImageProcessorFast.from_pretrained( tmpdir, do_normalize=False, padding_value=1.0 ) @@ -192,7 +202,7 @@ def test_save_load_pretrained_additional_features(self): self.assertIsInstance(processor.tokenizer, BertTokenizerFast) self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, GroundingDinoImageProcessor) + self.assertIsInstance(processor.image_processor, GroundingDinoImageProcessorFast) # Copied from tests.models.clip.test_processing_clip.CLIPProcessorTest.test_image_processor with CLIP->GroundingDino def test_image_processor(self): diff --git a/tests/models/instructblip/test_processing_instructblip.py b/tests/models/instructblip/test_processing_instructblip.py index 019fe85f72e1..0ec587b48bad 100644 --- a/tests/models/instructblip/test_processing_instructblip.py +++ b/tests/models/instructblip/test_processing_instructblip.py @@ -17,8 +17,8 @@ import pytest -from transformers.testing_utils import require_vision -from transformers.utils import is_vision_available +from transformers.testing_utils import require_torchvision, require_vision +from transformers.utils import is_torchvision_available, is_vision_available from ...test_processing_common import ProcessorTesterMixin @@ -27,14 +27,17 @@ from transformers import ( AutoProcessor, BertTokenizerFast, - BlipImageProcessor, GPT2Tokenizer, InstructBlipProcessor, PreTrainedTokenizerFast, ) +if is_torchvision_available(): + from transformers import BlipImageProcessorFast + @require_vision +@require_torchvision class InstructBlipProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = InstructBlipProcessor @@ -42,7 +45,7 @@ class InstructBlipProcessorTest(ProcessorTesterMixin, unittest.TestCase): def setUpClass(cls): cls.tmpdirname = tempfile.mkdtemp() - image_processor = BlipImageProcessor() + image_processor = BlipImageProcessorFast() tokenizer = GPT2Tokenizer.from_pretrained("hf-internal-testing/tiny-random-GPT2Model") qformer_tokenizer = BertTokenizerFast.from_pretrained("hf-internal-testing/tiny-random-bert") @@ -86,7 +89,7 @@ def test_save_load_pretrained_additional_features(self): self.assertIsInstance(processor.tokenizer, PreTrainedTokenizerFast) self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, BlipImageProcessor) + self.assertIsInstance(processor.image_processor, BlipImageProcessorFast) self.assertIsInstance(processor.qformer_tokenizer, BertTokenizerFast) def test_image_processor(self): @@ -104,8 +107,8 @@ def test_image_processor(self): image_input = self.prepare_image_inputs() - input_feat_extract = 
image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") + input_feat_extract = image_processor(image_input, return_tensors="pt") + input_processor = processor(images=image_input, return_tensors="pt") for key in input_feat_extract: self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) diff --git a/tests/models/kosmos2/test_processing_kosmos2.py b/tests/models/kosmos2/test_processing_kosmos2.py index c2c98882ef02..a21c139619fe 100644 --- a/tests/models/kosmos2/test_processing_kosmos2.py +++ b/tests/models/kosmos2/test_processing_kosmos2.py @@ -21,6 +21,13 @@ import numpy as np import pytest +from transformers import ( + AutoProcessor, + Kosmos2Processor, + PreTrainedTokenizerFast, + XLMRobertaTokenizer, + XLMRobertaTokenizerFast, +) from transformers.image_utils import load_image from transformers.models.auto.processing_auto import processor_class_from_name from transformers.testing_utils import ( @@ -28,9 +35,10 @@ require_sentencepiece, require_tokenizers, require_torch, + require_torchvision, require_vision, ) -from transformers.utils import is_vision_available +from transformers.utils import is_torchvision_available, is_vision_available from ...test_processing_common import ProcessorTesterMixin, url_to_local_path @@ -38,15 +46,9 @@ if is_vision_available(): from PIL import Image - from transformers import ( - AutoProcessor, - CLIPImageProcessor, - Kosmos2Processor, - PreTrainedTokenizerFast, - XLMRobertaTokenizer, - XLMRobertaTokenizerFast, - ) +if is_torchvision_available(): + from transformers import CLIPImageProcessorFast SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") @@ -54,6 +56,7 @@ @require_sentencepiece @require_tokenizers @require_vision +@require_torchvision class Kosmos2ProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = Kosmos2Processor @@ -61,7 +64,7 @@ class Kosmos2ProcessorTest(ProcessorTesterMixin, unittest.TestCase): def setUpClass(cls): cls.tmpdirname = tempfile.mkdtemp() - image_processor = CLIPImageProcessor(do_center_crop=False) + image_processor = CLIPImageProcessorFast(do_center_crop=False) # We have a SentencePiece fixture for testing slow_tokenizer = XLMRobertaTokenizer(SAMPLE_VOCAB) @@ -99,10 +102,10 @@ def tearDownClass(cls): def test_image_processor_load_save_reload(self): # make sure load from Hub repo. 
-> save -> reload locally work - image_processor = CLIPImageProcessor.from_pretrained("microsoft/kosmos-2-patch14-224") + image_processor = CLIPImageProcessorFast.from_pretrained("microsoft/kosmos-2-patch14-224") with TemporaryDirectory() as tmp_dir: image_processor.save_pretrained(tmp_dir) - reloaded_image_processor = CLIPImageProcessor.from_pretrained(tmp_dir) + reloaded_image_processor = CLIPImageProcessorFast.from_pretrained(tmp_dir) assert image_processor.to_dict() == reloaded_image_processor.to_dict() assert image_processor.to_json_string() == reloaded_image_processor.to_json_string() @@ -122,7 +125,7 @@ def test_save_load_pretrained_additional_features(self): self.assertIsInstance(processor.tokenizer, PreTrainedTokenizerFast) self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, CLIPImageProcessor) + self.assertIsInstance(processor.image_processor, CLIPImageProcessorFast) def test_image_processor(self): image_processor = self.get_image_processor() @@ -132,8 +135,8 @@ def test_image_processor(self): image_input = self.prepare_image_inputs() - input_image_processor = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") + input_image_processor = image_processor(image_input, return_tensors="pt") + input_processor = processor(images=image_input, return_tensors="pt") for key in input_image_processor: self.assertAlmostEqual(input_image_processor[key].sum(), input_processor[key].sum(), delta=1e-2) @@ -189,7 +192,8 @@ def test_tokenizer_decode(self): def test_full_processor(self): url = url_to_local_path("https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/two_dogs.jpg") - processor = Kosmos2Processor.from_pretrained("microsoft/kosmos-2-patch14-224") + # BC with use_square_size + processor = Kosmos2Processor.from_pretrained("microsoft/kosmos-2-patch14-224", size=(224, 224)) # test with different input formats. 
# fmt: off @@ -395,8 +399,8 @@ def check(texts, bboxes, expected_input_ids): outputs.image_embeds_position_mask, [0] * 2 + [1] * num_image_tokens + [0] + [0] * (len(expected_input_ids[0]) - 1), ) - np.testing.assert_allclose(outputs.pixel_values[0][:3, :3, :3], EXPECTED_PIXEL_VALUES_1, atol=1e-9) - np.testing.assert_allclose(outputs.pixel_values[0][:3, -3:, -3:], EXPECTED_PIXEL_VALUES_2, atol=1e-9) + np.testing.assert_allclose(outputs.pixel_values[0][:3, :3, :3].numpy(), EXPECTED_PIXEL_VALUES_1, atol=1e-4) + np.testing.assert_allclose(outputs.pixel_values[0][:3, -3:, -3:].numpy(), EXPECTED_PIXEL_VALUES_2, atol=1e-4) # test with image in batch (right padding) outputs = processor( @@ -409,10 +413,10 @@ def check(texts, bboxes, expected_input_ids): ) self.assertTupleEqual(outputs.pixel_values.shape, (4, 3, 224, 224)) np.testing.assert_allclose( - outputs.pixel_values[:, :3, :3, :3].numpy(), [EXPECTED_PIXEL_VALUES_1] * len(batch_image), atol=1e-9 + outputs.pixel_values[:, :3, :3, :3].numpy(), [EXPECTED_PIXEL_VALUES_1] * len(batch_image), atol=1e-4 ) np.testing.assert_allclose( - outputs.pixel_values[:, :3, -3:, -3:].numpy(), [EXPECTED_PIXEL_VALUES_2] * len(batch_image), atol=1e-9 + outputs.pixel_values[:, :3, -3:, -3:].numpy(), [EXPECTED_PIXEL_VALUES_2] * len(batch_image), atol=1e-4 ) # padding on the right: the `[1:]` below is because the part for `BOS` is already added in the beginning of each (dynamically computed) expected value # noqa # fmt: off diff --git a/tests/models/kosmos2_5/test_processor_kosmos2_5.py b/tests/models/kosmos2_5/test_processing_kosmos2_5.py similarity index 96% rename from tests/models/kosmos2_5/test_processor_kosmos2_5.py rename to tests/models/kosmos2_5/test_processing_kosmos2_5.py index 1bc41307712c..10b2b61e45bd 100644 --- a/tests/models/kosmos2_5/test_processor_kosmos2_5.py +++ b/tests/models/kosmos2_5/test_processing_kosmos2_5.py @@ -25,9 +25,10 @@ from transformers.image_utils import load_image from transformers.testing_utils import ( require_torch, + require_torchvision, require_vision, ) -from transformers.utils import is_vision_available +from transformers.utils import is_torchvision_available, is_vision_available from ...test_processing_common import ProcessorTesterMixin, url_to_local_path @@ -38,20 +39,23 @@ from transformers import ( AutoProcessor, AutoTokenizer, - Kosmos2_5ImageProcessor, Kosmos2_5Processor, PreTrainedTokenizerFast, ) +if is_torchvision_available(): + from transformers import Kosmos2_5ImageProcessorFast + @require_vision +@require_torchvision class Kosmos2_5ProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = Kosmos2_5Processor images_input_name = "flattened_patches" def setUp(self): self.tmpdirname = tempfile.mkdtemp() - image_processor = Kosmos2_5ImageProcessor() + image_processor = Kosmos2_5ImageProcessorFast() tokenizer = AutoTokenizer.from_pretrained("microsoft/kosmos-2.5") processor = Kosmos2_5Processor(image_processor, tokenizer) processor.save_pretrained(self.tmpdirname) @@ -67,10 +71,10 @@ def tearDown(self): def test_image_procesor_load_save_reload(self): # make sure load from Hub repo. 
-> save -> reload locally work - image_processor = Kosmos2_5ImageProcessor.from_pretrained("microsoft/kosmos-2.5") + image_processor = Kosmos2_5ImageProcessorFast.from_pretrained("microsoft/kosmos-2.5") with TemporaryDirectory() as tmp_dir: image_processor.save_pretrained(tmp_dir) - reloaded_image_processor = Kosmos2_5ImageProcessor.from_pretrained(tmp_dir) + reloaded_image_processor = Kosmos2_5ImageProcessorFast.from_pretrained(tmp_dir) assert image_processor.to_dict() == reloaded_image_processor.to_dict() assert image_processor.to_json_string() == reloaded_image_processor.to_json_string() @@ -96,7 +100,7 @@ def test_save_load_pretrained_additional_features(self): processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string(), ) - self.assertIsInstance(processor.image_processor, Kosmos2_5ImageProcessor) + self.assertIsInstance(processor.image_processor, Kosmos2_5ImageProcessorFast) @unittest.skip(reason="kosmos-2.5 must have both image and text") def test_image_processor(self): @@ -356,12 +360,12 @@ def test_full_processor(self): np.testing.assert_allclose( outputs.flattened_patches[0][1][:10].numpy().tolist(), EXPECTED_FP_1, - atol=1e-9, + atol=1e-4, ) np.testing.assert_allclose( outputs.flattened_patches[0][200][:10].numpy().tolist(), EXPECTED_FP_200, - atol=1e-9, + atol=1e-4, ) # test a batch of images and texts, right padding diff --git a/tests/models/ovis2/test_processor_ovis2.py b/tests/models/ovis2/test_processing_ovis2.py similarity index 100% rename from tests/models/ovis2/test_processor_ovis2.py rename to tests/models/ovis2/test_processing_ovis2.py diff --git a/tests/models/sam2/test_processor_sam2.py b/tests/models/sam2/test_processing_sam2.py similarity index 100% rename from tests/models/sam2/test_processor_sam2.py rename to tests/models/sam2/test_processing_sam2.py diff --git a/tests/models/sam2_video/test_processor_sam2_video.py b/tests/models/sam2_video/test_processing_sam2_video.py similarity index 100% rename from tests/models/sam2_video/test_processor_sam2_video.py rename to tests/models/sam2_video/test_processing_sam2_video.py diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index 295ee03a769e..ef7b7e8a157b 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -26,7 +26,11 @@ from parameterized import parameterized from transformers.models.auto.processing_auto import processor_class_from_name -from transformers.processing_utils import Unpack +from transformers.processing_utils import ( + MODALITY_TO_AUTOPROCESSOR_MAPPING, + SPECIAL_MODULE_TO_MODEL_NAME_MAPPING, + Unpack, +) from transformers.testing_utils import ( check_json_file_has_correct_format, require_av, @@ -64,7 +68,6 @@ ], } - for modality, urls in MODALITY_INPUT_DATA.items(): MODALITY_INPUT_DATA[modality] = [url_to_local_path(url) for url in urls] @@ -106,13 +109,30 @@ def prepare_processor_dict(): def get_component(self, attribute, **kwargs): assert attribute in self.processor_class.attributes - component_class_name = getattr(self.processor_class, f"{attribute}_class") + # determine from current file name + if attribute not in MODALITY_TO_AUTOPROCESSOR_MAPPING and "tokenizer" in attribute: + attribute = "tokenizer" + model_name_lowercase = self.__class__.__module__.split(".")[-1].replace("test_processing_", "").split(".")[0] + component_class_name = MODALITY_TO_AUTOPROCESSOR_MAPPING[attribute].get(model_name_lowercase, None) + if component_class_name is None: + component_class_name = 
MODALITY_TO_AUTOPROCESSOR_MAPPING[attribute].get( + model_name_lowercase.replace("_", "-"), None + ) + if component_class_name is None: + component_class_name = MODALITY_TO_AUTOPROCESSOR_MAPPING[attribute].get( + SPECIAL_MODULE_TO_MODEL_NAME_MAPPING.get(model_name_lowercase, None), None + ) + if component_class_name is None: + raise ValueError(f"Could not find component class name for {attribute} and {model_name_lowercase}") if isinstance(component_class_name, tuple): if attribute == "image_processor": - # TODO: @yoni, change logic in v4.52 (when use_fast set to True by default) - component_class_name = component_class_name[0] + component_class_name = ( + component_class_name[-1] if component_class_name[-1] else component_class_name[0] + ) else: - component_class_name = component_class_name[-1] + component_class_name = ( + component_class_name[-1] if isinstance(component_class_name, tuple) else component_class_name + ) component_class = processor_class_from_name(component_class_name) component = component_class.from_pretrained(self.tmpdirname, **kwargs) # noqa
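
Note on the lookup chain introduced above: the new `get_component` resolves the processor class name from the test module name with three fallbacks (exact lowercase model name, underscores replaced by hyphens, then a special-module alias). The sketch below is illustrative only. The two dictionaries are toy stand-ins with invented entries (the real `MODALITY_TO_AUTOPROCESSOR_MAPPING` and `SPECIAL_MODULE_TO_MODEL_NAME_MAPPING` live in `transformers.processing_utils`), and `resolve_component_class_name` is a hypothetical helper written only to show the control flow, not part of the library or of this patch.

```python
# Illustrative sketch of the class-name resolution mirrored from `get_component` above.
# Both mappings here are toy stand-ins; their contents are made up for the example.

MODALITY_TO_AUTOPROCESSOR_MAPPING = {
    "image_processor": {
        "kosmos2": "CLIPImageProcessorFast",          # hypothetical entry
        "kosmos-2_5": "Kosmos2_5ImageProcessorFast",  # hypothetical entry
    },
    "tokenizer": {
        # tuples list (slow, fast) variants
        "kosmos2": ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast"),  # hypothetical entry
    },
}
SPECIAL_MODULE_TO_MODEL_NAME_MAPPING = {
    "kosmos2_5": "kosmos-2_5",  # hypothetical alias: test module name -> model type
}


def resolve_component_class_name(attribute: str, test_module: str) -> str:
    """Mirror the fallback chain: exact name, underscores->hyphens, then special alias."""
    # e.g. "tests.models.kosmos2.test_processing_kosmos2" -> "kosmos2"
    model_name = test_module.split(".")[-1].replace("test_processing_", "")
    mapping = MODALITY_TO_AUTOPROCESSOR_MAPPING[attribute]

    class_name = mapping.get(model_name)
    if class_name is None:
        class_name = mapping.get(model_name.replace("_", "-"))
    if class_name is None:
        class_name = mapping.get(SPECIAL_MODULE_TO_MODEL_NAME_MAPPING.get(model_name))
    if class_name is None:
        raise ValueError(f"Could not find component class name for {attribute} and {model_name}")

    # When both slow and fast variants are registered, prefer the fast one if it exists.
    if isinstance(class_name, tuple):
        class_name = class_name[-1] if class_name[-1] else class_name[0]
    return class_name


if __name__ == "__main__":
    # Resolves via the exact-name lookup.
    print(resolve_component_class_name("image_processor", "tests.models.kosmos2.test_processing_kosmos2"))
    # Resolves via the special-module alias fallback.
    print(resolve_component_class_name("image_processor", "tests.models.kosmos2_5.test_processing_kosmos2_5"))
```

The design choice this mirrors: tests no longer read `<attribute>_class` from the processor class but derive the component from the auto mappings, which is what lets every test default to the fast (torchvision-backed) image processor when one is registered.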