From 221f587f6c5aff30d5fb9bbf60a433b820eeccc7 Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Mon, 6 Oct 2025 15:59:34 +0000 Subject: [PATCH 1/4] set common_kwargs defaults before updating with kwargs --- src/transformers/processing_utils.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index e7786d1ba61d..6b8ff98e0e3c 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -1305,6 +1305,13 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg # pass defaults to output dictionary output_kwargs.update(default_kwargs) + # For `common_kwargs` just update all modality-specific kwargs with same key/values + common_kwargs = kwargs.get("common_kwargs", {}) + common_kwargs.update(ModelProcessorKwargs._defaults.get("common_kwargs", {})) + if common_kwargs: + for kwarg in output_kwargs.values(): + kwarg.update(common_kwargs) + # update modality kwargs with passed kwargs non_modality_kwargs = set(kwargs) - set(output_kwargs) for modality, output_kwarg in output_kwargs.items(): @@ -1354,13 +1361,6 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg f"Keyword argument `{key}` is not a valid argument for this processor and will be ignored." ) - # For `common_kwargs` just update all modality-specific kwargs with same key/values - common_kwargs = kwargs.get("common_kwargs", {}) - common_kwargs.update(ModelProcessorKwargs._defaults.get("common_kwargs", {})) - if common_kwargs: - for kwarg in output_kwargs.values(): - kwarg.update(common_kwargs) - return output_kwargs @classmethod From e76ddae4dbbc68037db752b82a2bdb478691f11b Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Mon, 6 Oct 2025 19:06:50 +0000 Subject: [PATCH 2/4] modify auto image procesor logic and make sure all processors use AutoImageProcessor --- src/transformers/image_processing_utils.py | 6 ++++++ src/transformers/models/align/processing_align.py | 4 ++-- .../models/auto/image_processing_auto.py | 14 ++------------ .../models/bridgetower/processing_bridgetower.py | 8 ++++---- .../models/chameleon/processing_chameleon.py | 6 +++--- src/transformers/models/emu3/processing_emu3.py | 6 +++--- src/transformers/models/flava/processing_flava.py | 6 +++--- src/transformers/models/fuyu/processing_fuyu.py | 6 +++--- .../grounding_dino/processing_grounding_dino.py | 10 +++++----- .../models/idefics/processing_idefics.py | 6 +++--- .../models/idefics2/processing_idefics2.py | 6 +++--- .../models/idefics3/processing_idefics3.py | 6 +++--- src/transformers/models/janus/processing_janus.py | 6 +++--- .../models/layoutlmv2/processing_layoutlmv2.py | 14 +++++++------- .../models/layoutlmv3/processing_layoutlmv3.py | 12 ++++++------ .../models/layoutxlm/processing_layoutxlm.py | 14 +++++++------- .../models/lfm2_vl/processing_lfm2_vl.py | 6 +++--- .../models/mllama/processing_mllama.py | 6 +++--- .../models/oneformer/processing_oneformer.py | 14 +++++++------- .../models/owlvit/processing_owlvit.py | 6 +++--- .../phi4_multimodal/processing_phi4_multimodal.py | 6 +++--- .../models/pix2struct/processing_pix2struct.py | 6 +++--- src/transformers/models/sam/processing_sam.py | 12 ++++++------ src/transformers/models/sam2/processing_sam2.py | 12 ++++++------ .../models/sam2_video/modular_sam2_video.py | 10 +++++----- .../models/sam2_video/processing_sam2_video.py | 8 ++++---- src/transformers/models/sam_hq/processing_samhq.py | 12 ++++++------ 
.../models/siglip2/processing_siglip2.py | 4 ++-- .../models/smolvlm/processing_smolvlm.py | 4 ++-- src/transformers/models/tvp/processing_tvp.py | 4 ++-- src/transformers/models/udop/processing_udop.py | 8 ++++---- .../models/video_llava/processing_video_llava.py | 6 +++--- src/transformers/models/vilt/processing_vilt.py | 8 ++++---- .../models/x_clip/processing_x_clip.py | 8 ++++---- 34 files changed, 133 insertions(+), 137 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 3227b08cf031..f4d8c1606602 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -41,6 +41,12 @@ class BaseImageProcessor(ImageProcessingMixin): def __init__(self, **kwargs): super().__init__(**kwargs) + if not self.is_fast: + logger.warning_once( + f"Using a slow image processor (`{self.__class__.__name__}`). " + "As we are transitioning to fast (PyTorch-native) processors, consider using `AutoImageProcessor` or the model-specific fast image processor class " + "to instantiate a fast image processor." + ) @property def is_fast(self) -> bool: diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index fbca27b2ff39..41ec7ac6a9a4 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -52,7 +52,7 @@ class AlignProcessor(ProcessorMixin): ``` Args: - image_processor ([`EfficientNetImageProcessor`]): + image_processor ([`AutoImageProcessor`]): The image processor is a required input. tokenizer ([`BertTokenizer`, `BertTokenizerFast`]): The tokenizer is a required input. @@ -60,7 +60,7 @@ class AlignProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "EfficientNetImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = ("BertTokenizer", "BertTokenizerFast") valid_processor_kwargs = AlignProcessorKwargs diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index eeea333aa2e8..3684dc17bac9 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -49,9 +49,6 @@ logger = logging.get_logger(__name__) -FORCE_FAST_IMAGE_PROCESSOR = ["Qwen2VLImageProcessor"] - - if TYPE_CHECKING: # This significantly improves completion suggestion performance when # the transformers package is used with Microsoft's Pylance language server. @@ -519,19 +516,12 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): # if use_fast is not set and the processor was saved with a fast processor, we use it, otherwise we use the slow processor. if use_fast is None: use_fast = image_processor_type.endswith("Fast") - if not use_fast and image_processor_type in FORCE_FAST_IMAGE_PROCESSOR and is_torchvision_available(): - use_fast = True + if not use_fast and is_torchvision_available(): logger.warning_once( f"The image processor of type `{image_processor_type}` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. " "This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. " - "Note that this behavior will be extended to all models in a future release." 
- ) - if not use_fast: - logger.warning_once( - "Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. " - "`use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. " - "This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`." ) + use_fast = True if use_fast and not image_processor_type.endswith("Fast"): image_processor_type += "Fast" if use_fast and not is_torchvision_available(): diff --git a/src/transformers/models/bridgetower/processing_bridgetower.py b/src/transformers/models/bridgetower/processing_bridgetower.py index 030c578c49cd..95aedf6fe180 100644 --- a/src/transformers/models/bridgetower/processing_bridgetower.py +++ b/src/transformers/models/bridgetower/processing_bridgetower.py @@ -43,19 +43,19 @@ class BridgeTowerProcessor(ProcessorMixin): Constructs a BridgeTower processor which wraps a Roberta tokenizer and BridgeTower image processor into a single processor. - [`BridgeTowerProcessor`] offers all the functionalities of [`BridgeTowerImageProcessor`] and + [`BridgeTowerProcessor`] offers all the functionalities of [`AutoImageProcessor`] and [`RobertaTokenizerFast`]. See the docstring of [`~BridgeTowerProcessor.__call__`] and [`~BridgeTowerProcessor.decode`] for more information. Args: - image_processor (`BridgeTowerImageProcessor`): - An instance of [`BridgeTowerImageProcessor`]. The image processor is a required input. + image_processor (`AutoImageProcessor`): + An instance of [`AutoImageProcessor`]. The image processor is a required input. tokenizer (`RobertaTokenizerFast`): An instance of ['RobertaTokenizerFast`]. The tokenizer is a required input. """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "BridgeTowerImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = ("RobertaTokenizer", "RobertaTokenizerFast") valid_processor_kwargs = BridgeTowerProcessorKwargs diff --git a/src/transformers/models/chameleon/processing_chameleon.py b/src/transformers/models/chameleon/processing_chameleon.py index 247f72322a2d..53dafc0d504e 100644 --- a/src/transformers/models/chameleon/processing_chameleon.py +++ b/src/transformers/models/chameleon/processing_chameleon.py @@ -55,11 +55,11 @@ class ChameleonProcessor(ProcessorMixin): Constructs a Chameleon processor which wraps a Chameleon image processor and a Chameleon tokenizer into a single processor. - [`ChameleonProcessor`] offers all the functionalities of [`ChameleonImageProcessor`] and [`LlamaTokenizerFast`]. + [`ChameleonProcessor`] offers all the functionalities of [`AutoImageProcessor`] and [`LlamaTokenizerFast`]. See the [`~ChameleonProcessor.__call__`] and [`~ChameleonProcessor.decode`] for more information. Args: - image_processor ([`ChameleonImageProcessor`]): + image_processor ([`AutoImageProcessor`]): The image processor is a required input. tokenizer ([`LlamaTokenizerFast`]): The tokenizer is a required input. 
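For reference, a minimal sketch of what the revised `use_fast` resolution in `image_processing_auto.py` above means for callers. The checkpoint id is a placeholder, and the fast path assumes torchvision is installed:

```python
from transformers import AutoImageProcessor

# Hypothetical checkpoint saved with a slow image processor (placeholder id).
ckpt = "org/model-saved-with-slow-processor"

# With `use_fast` unset, a fast (torchvision-based) processor is now returned
# whenever one is available, and a one-time warning notes the behavior change.
fast = AutoImageProcessor.from_pretrained(ckpt)
print(fast.is_fast)  # expected: True

# Passing `use_fast=False` keeps the original slow processor and its outputs.
slow = AutoImageProcessor.from_pretrained(ckpt, use_fast=False)
print(slow.is_fast)  # expected: False
```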
@@ -71,7 +71,7 @@ class ChameleonProcessor(ProcessorMixin): attributes = ["image_processor", "tokenizer"] tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast") - image_processor_class = "ChameleonImageProcessor" + image_processor_class = "AutoImageProcessor" def __init__(self, image_processor, tokenizer, image_seq_length: int = 1024, image_token: str = ""): self.image_seq_length = image_seq_length diff --git a/src/transformers/models/emu3/processing_emu3.py b/src/transformers/models/emu3/processing_emu3.py index b7ed8e9074f0..ea3336336d32 100644 --- a/src/transformers/models/emu3/processing_emu3.py +++ b/src/transformers/models/emu3/processing_emu3.py @@ -52,11 +52,11 @@ class Emu3Processor(ProcessorMixin): Constructs a Emu3 processor which wraps a Emu3 image processor and a GPT2 tokenizer into a single processor. - [`Emu3Processor`] offers all the functionalities of [`Emu3ImageProcessor`] and [`GPT2TokenizerFast`]. + [`Emu3Processor`] offers all the functionalities of [`AutoImageProcessor`] and [`GPT2TokenizerFast`]. See the [`~Emu3Processor.__call__`] and [`~Emu3Processor.decode`] for more information. Args: - image_processor ([`Emu3ImageProcessor`]): + image_processor ([`AutoImageProcessor`]): The image processor is a required input. tokenizer ([`Emu3TokenizerFast`]): The tokenizer is a required input. @@ -66,7 +66,7 @@ class Emu3Processor(ProcessorMixin): attributes = ["image_processor", "tokenizer"] tokenizer_class = ("GPT2Tokenizer", "GPT2TokenizerFast") - image_processor_class = "Emu3ImageProcessor" + image_processor_class = "AutoImageProcessor" def __init__( self, diff --git a/src/transformers/models/flava/processing_flava.py b/src/transformers/models/flava/processing_flava.py index 8e8a806e8615..76bba6591175 100644 --- a/src/transformers/models/flava/processing_flava.py +++ b/src/transformers/models/flava/processing_flava.py @@ -25,16 +25,16 @@ class FlavaProcessor(ProcessorMixin): r""" Constructs a FLAVA processor which wraps a FLAVA image processor and a FLAVA tokenizer into a single processor. - [`FlavaProcessor`] offers all the functionalities of [`FlavaImageProcessor`] and [`BertTokenizerFast`]. See the + [`FlavaProcessor`] offers all the functionalities of [`AutoImageProcessor`] and [`BertTokenizerFast`]. See the [`~FlavaProcessor.__call__`] and [`~FlavaProcessor.decode`] for more information. Args: - image_processor ([`FlavaImageProcessor`], *optional*): The image processor is a required input. + image_processor ([`AutoImageProcessor`], *optional*): The image processor is a required input. tokenizer ([`BertTokenizerFast`], *optional*): The tokenizer is a required input. """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "FlavaImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = ("BertTokenizer", "BertTokenizerFast") def __init__(self, image_processor=None, tokenizer=None, **kwargs): diff --git a/src/transformers/models/fuyu/processing_fuyu.py b/src/transformers/models/fuyu/processing_fuyu.py index 75b2bbad926e..f9f64957730e 100644 --- a/src/transformers/models/fuyu/processing_fuyu.py +++ b/src/transformers/models/fuyu/processing_fuyu.py @@ -337,18 +337,18 @@ class FuyuProcessor(ProcessorMixin): r""" Constructs a Fuyu processor which wraps a Fuyu image processor and a Llama tokenizer into a single processor. - [`FuyuProcessor`] offers all the functionalities of [`FuyuImageProcessor`] and [`LlamaTokenizerFast`]. 
See the + [`FuyuProcessor`] offers all the functionalities of [`AutoImageProcessor`] and [`LlamaTokenizerFast`]. See the [`~FuyuProcessor.__call__`] and [`~FuyuProcessor.decode`] for more information. Args: - image_processor ([`FuyuImageProcessor`]): + image_processor ([`AutoImageProcessor`]): The image processor is a required input. tokenizer ([`LlamaTokenizerFast`]): The tokenizer is a required input. """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "FuyuImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = "AutoTokenizer" def __init__(self, image_processor, tokenizer, **kwargs): diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 5f2f900451b2..4e090329869a 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -119,19 +119,19 @@ class GroundingDinoProcessor(ProcessorMixin): Constructs a Grounding DINO processor which wraps a Deformable DETR image processor and a BERT tokenizer into a single processor. - [`GroundingDinoProcessor`] offers all the functionalities of [`GroundingDinoImageProcessor`] and + [`GroundingDinoProcessor`] offers all the functionalities of [`AutoImageProcessor`] and [`AutoTokenizer`]. See the docstring of [`~GroundingDinoProcessor.__call__`] and [`~GroundingDinoProcessor.decode`] for more information. Args: - image_processor (`GroundingDinoImageProcessor`): - An instance of [`GroundingDinoImageProcessor`]. The image processor is a required input. + image_processor (`AutoImageProcessor`): + An instance of [`AutoImageProcessor`]. The image processor is a required input. tokenizer (`AutoTokenizer`): An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input. """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "GroundingDinoImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = "AutoTokenizer" valid_processor_kwargs = GroundingDinoProcessorKwargs @@ -145,7 +145,7 @@ def __call__( **kwargs: Unpack[GroundingDinoProcessorKwargs], ) -> BatchEncoding: """ - This method uses [`GroundingDinoImageProcessor.__call__`] method to prepare image(s) for the model, and + This method uses [`AutoImageProcessor.__call__`] method to prepare image(s) for the model, and [`BertTokenizerFast.__call__`] to prepare text for the model. Args: diff --git a/src/transformers/models/idefics/processing_idefics.py b/src/transformers/models/idefics/processing_idefics.py index 4b5ccaffe5c8..00344fd9bfd6 100644 --- a/src/transformers/models/idefics/processing_idefics.py +++ b/src/transformers/models/idefics/processing_idefics.py @@ -137,13 +137,13 @@ def is_url(string): class IdeficsProcessor(ProcessorMixin): r""" - Constructs a IDEFICS processor which wraps a LLama tokenizer and IDEFICS image processor into a single processor. + Constructs a IDEFICS processor which wraps a LLama tokenizer and AutoImageProcessor into a single processor. [`IdeficsProcessor`] offers all the functionalities of [`IdeficsImageProcessor`] and [`LlamaTokenizerFast`]. See the docstring of [`~IdeficsProcessor.__call__`] and [`~IdeficsProcessor.decode`] for more information. Args: - image_processor (`IdeficsImageProcessor`): + image_processor (`AutoImageProcessor`): An instance of [`IdeficsImageProcessor`]. The image processor is a required input. 
tokenizer (`LlamaTokenizerFast`): An instance of [`LlamaTokenizerFast`]. The tokenizer is a required input. @@ -154,7 +154,7 @@ class IdeficsProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "IdeficsImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = "LlamaTokenizerFast" def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_utterance_token=None, **kwargs): diff --git a/src/transformers/models/idefics2/processing_idefics2.py b/src/transformers/models/idefics2/processing_idefics2.py index c419a3641254..15954ce8d5d1 100644 --- a/src/transformers/models/idefics2/processing_idefics2.py +++ b/src/transformers/models/idefics2/processing_idefics2.py @@ -57,13 +57,13 @@ class Idefics2ProcessorKwargs(ProcessingKwargs, total=False): class Idefics2Processor(ProcessorMixin): r""" - Constructs a IDEFICS2 processor which wraps a LLama tokenizer and IDEFICS2 image processor into a single processor. + Constructs a IDEFICS2 processor which wraps a LLama tokenizer and AutoImageProcessor into a single processor. [`IdeficsProcessor`] offers all the functionalities of [`Idefics2ImageProcessor`] and [`LlamaTokenizerFast`]. See the docstring of [`~IdeficsProcessor.__call__`] and [`~IdeficsProcessor.decode`] for more information. Args: - image_processor (`Idefics2ImageProcessor`): + image_processor (`AutoImageProcessor`): An instance of [`Idefics2ImageProcessor`]. The image processor is a required input. tokenizer (`PreTrainedTokenizerBase`, *optional*): An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input. @@ -76,7 +76,7 @@ class Idefics2Processor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "Idefics2ImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = "AutoTokenizer" def __init__( diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py index 451af1d8a38f..09acdedb5b76 100644 --- a/src/transformers/models/idefics3/processing_idefics3.py +++ b/src/transformers/models/idefics3/processing_idefics3.py @@ -103,13 +103,13 @@ class Idefics3ProcessorKwargs(ProcessingKwargs, total=False): class Idefics3Processor(ProcessorMixin): r""" - Constructs a Idefics3 processor which wraps a LLama tokenizer and Idefics3 image processor into a single processor. + Constructs a Idefics3 processor which wraps a LLama tokenizer and AutoImageProcessor into a single processor. [`Idefics3Processor`] offers all the functionalities of [`Idefics3ImageProcessor`] and [`Idefics3TokenizerFast`]. See the docstring of [`~IdeficsProcessor.__call__`] and [`~IdeficsProcessor.decode`] for more information. Args: - image_processor (`Idefics3ImageProcessor`): + image_processor (`AutoImageProcessor`): An instance of [`Idefics3ImageProcessor`]. The image processor is a required input. tokenizer (`PreTrainedTokenizerBase`, *optional*): An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input. 
@@ -122,7 +122,7 @@ class Idefics3Processor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "Idefics3ImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = "AutoTokenizer" def __init__( diff --git a/src/transformers/models/janus/processing_janus.py b/src/transformers/models/janus/processing_janus.py index 15c237c4ced4..b11e61db8e73 100644 --- a/src/transformers/models/janus/processing_janus.py +++ b/src/transformers/models/janus/processing_janus.py @@ -50,11 +50,11 @@ class JanusProcessor(ProcessorMixin): r""" Constructs a Janus processor which wraps a Janus Image Processor and a Llama tokenizer into a single processor. - [`JanusProcessor`] offers all the functionalities of [`JanusImageProcessor`] and [`LlamaTokenizerFast`]. See the + [`JanusProcessor`] offers all the functionalities of [`AutoImageProcessor`] and [`LlamaTokenizerFast`]. See the [`~JanusProcessor.__call__`] and [`~JanusProcessor.decode`] for more information. Args: - image_processor ([`JanusImageProcessor`]): + image_processor ([`AutoImageProcessor`]): The image processor is a required input. tokenizer ([`LlamaTokenizerFast`]): The tokenizer is a required input. @@ -65,7 +65,7 @@ class JanusProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "JanusImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = "LlamaTokenizerFast" def __init__(self, image_processor, tokenizer, chat_template=None, use_default_system_prompt=False, **kwargs): diff --git a/src/transformers/models/layoutlmv2/processing_layoutlmv2.py b/src/transformers/models/layoutlmv2/processing_layoutlmv2.py index 603cdf4df4e9..5e95461b6265 100644 --- a/src/transformers/models/layoutlmv2/processing_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/processing_layoutlmv2.py @@ -31,21 +31,21 @@ class LayoutLMv2Processor(ProcessorMixin): [`LayoutLMv2Processor`] offers all the functionalities you need to prepare data for the model. - It first uses [`LayoutLMv2ImageProcessor`] to resize document images to a fixed size, and optionally applies OCR to + It first uses [`AutoImageProcessor`] to resize document images to a fixed size, and optionally applies OCR to get words and normalized bounding boxes. These are then provided to [`LayoutLMv2Tokenizer`] or [`LayoutLMv2TokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`, `attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned into token-level `labels` for token classification tasks (such as FUNSD, CORD). Args: - image_processor (`LayoutLMv2ImageProcessor`, *optional*): - An instance of [`LayoutLMv2ImageProcessor`]. The image processor is a required input. + image_processor (`AutoImageProcessor`, *optional*): + An instance of [`AutoImageProcessor`]. The image processor is a required input. tokenizer (`LayoutLMv2Tokenizer` or `LayoutLMv2TokenizerFast`, *optional*): An instance of [`LayoutLMv2Tokenizer`] or [`LayoutLMv2TokenizerFast`]. The tokenizer is a required input. 
""" attributes = ["image_processor", "tokenizer"] - image_processor_class = "LayoutLMv2ImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast") def __init__(self, image_processor=None, tokenizer=None, **kwargs): @@ -86,10 +86,10 @@ def __call__( **kwargs, ) -> BatchEncoding: """ - This method first forwards the `images` argument to [`~LayoutLMv2ImageProcessor.__call__`]. In case - [`LayoutLMv2ImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and + This method first forwards the `images` argument to [`~AutoImageProcessor.__call__`]. In case + [`AutoImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and bounding boxes along with the additional arguments to [`~LayoutLMv2Tokenizer.__call__`] and returns the output, - together with resized `images`. In case [`LayoutLMv2ImageProcessor`] was initialized with `apply_ocr` set to + together with resized `images`. In case [`AutoImageProcessor`] was initialized with `apply_ocr` set to `False`, it passes the words (`text`/``text_pair`) and `boxes` specified by the user along with the additional arguments to [`~LayoutLMv2Tokenizer.__call__`] and returns the output, together with resized `images``. diff --git a/src/transformers/models/layoutlmv3/processing_layoutlmv3.py b/src/transformers/models/layoutlmv3/processing_layoutlmv3.py index 1f1b6cead607..15ae7a0e3a8f 100644 --- a/src/transformers/models/layoutlmv3/processing_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/processing_layoutlmv3.py @@ -31,21 +31,21 @@ class LayoutLMv3Processor(ProcessorMixin): [`LayoutLMv3Processor`] offers all the functionalities you need to prepare data for the model. - It first uses [`LayoutLMv3ImageProcessor`] to resize and normalize document images, and optionally applies OCR to + It first uses [`AutoImageProcessor`] to resize and normalize document images, and optionally applies OCR to get words and normalized bounding boxes. These are then provided to [`LayoutLMv3Tokenizer`] or [`LayoutLMv3TokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`, `attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned into token-level `labels` for token classification tasks (such as FUNSD, CORD). Args: - image_processor (`LayoutLMv3ImageProcessor`, *optional*): - An instance of [`LayoutLMv3ImageProcessor`]. The image processor is a required input. + image_processor (`AutoImageProcessor`, *optional*): + An instance of [`AutoImageProcessor`]. The image processor is a required input. tokenizer (`LayoutLMv3Tokenizer` or `LayoutLMv3TokenizerFast`, *optional*): An instance of [`LayoutLMv3Tokenizer`] or [`LayoutLMv3TokenizerFast`]. The tokenizer is a required input. """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "LayoutLMv3ImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast") def __init__(self, image_processor=None, tokenizer=None, **kwargs): @@ -86,8 +86,8 @@ def __call__( **kwargs, ) -> BatchEncoding: """ - This method first forwards the `images` argument to [`~LayoutLMv3ImageProcessor.__call__`]. In case - [`LayoutLMv3ImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and + This method first forwards the `images` argument to [`~AutoImageProcessor.__call__`]. 
In case + [`AutoImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and bounding boxes along with the additional arguments to [`~LayoutLMv3Tokenizer.__call__`] and returns the output, together with resized and normalized `pixel_values`. In case [`LayoutLMv3ImageProcessor`] was initialized with `apply_ocr` set to `False`, it passes the words (`text`/``text_pair`) and `boxes` specified by the user along diff --git a/src/transformers/models/layoutxlm/processing_layoutxlm.py b/src/transformers/models/layoutxlm/processing_layoutxlm.py index e3ece89f434b..bb49e5291519 100644 --- a/src/transformers/models/layoutxlm/processing_layoutxlm.py +++ b/src/transformers/models/layoutxlm/processing_layoutxlm.py @@ -31,21 +31,21 @@ class LayoutXLMProcessor(ProcessorMixin): [`LayoutXLMProcessor`] offers all the functionalities you need to prepare data for the model. - It first uses [`LayoutLMv2ImageProcessor`] to resize document images to a fixed size, and optionally applies OCR to + It first uses [`AutoImageProcessor`] to resize document images to a fixed size, and optionally applies OCR to get words and normalized bounding boxes. These are then provided to [`LayoutXLMTokenizer`] or [`LayoutXLMTokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`, `attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned into token-level `labels` for token classification tasks (such as FUNSD, CORD). Args: - image_processor (`LayoutLMv2ImageProcessor`, *optional*): - An instance of [`LayoutLMv2ImageProcessor`]. The image processor is a required input. + image_processor (`AutoImageProcessor`, *optional*): + An instance of [`AutoImageProcessor`]. The image processor is a required input. tokenizer (`LayoutXLMTokenizer` or `LayoutXLMTokenizerFast`, *optional*): An instance of [`LayoutXLMTokenizer`] or [`LayoutXLMTokenizerFast`]. The tokenizer is a required input. """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "LayoutLMv2ImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast") def __init__(self, image_processor=None, tokenizer=None, **kwargs): @@ -85,10 +85,10 @@ def __call__( **kwargs, ) -> BatchEncoding: """ - This method first forwards the `images` argument to [`~LayoutLMv2ImagePrpcessor.__call__`]. In case - [`LayoutLMv2ImagePrpcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and + This method first forwards the `images` argument to [`~AutoImageProcessor.__call__`]. In case + [`AutoImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and bounding boxes along with the additional arguments to [`~LayoutXLMTokenizer.__call__`] and returns the output, - together with resized `images`. In case [`LayoutLMv2ImagePrpcessor`] was initialized with `apply_ocr` set to + together with resized `images`. In case [`AutoImageProcessor`] was initialized with `apply_ocr` set to `False`, it passes the words (`text`/``text_pair`) and `boxes` specified by the user along with the additional arguments to [`~LayoutXLMTokenizer.__call__`] and returns the output, together with resized `images``. 
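As an illustration of what switching `image_processor_class` to `"AutoImageProcessor"` means for these wrapper processors, the concrete image processor is now resolved from the checkpoint's saved `image_processor_type` instead of a class pinned in the processor definition. A rough sketch; the checkpoint id below is a placeholder, not a real repository:

```python
from transformers import AutoProcessor

# Placeholder id for any checkpoint of the processors updated in this commit.
processor = AutoProcessor.from_pretrained("org/some-vision-language-checkpoint")

# The image processor sub-component is loaded through AutoImageProcessor,
# so its concrete class (slow or fast) comes from the checkpoint config.
print(type(processor.image_processor).__name__)
print(processor.image_processor.is_fast)
```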
diff --git a/src/transformers/models/lfm2_vl/processing_lfm2_vl.py b/src/transformers/models/lfm2_vl/processing_lfm2_vl.py index 12f289c266a1..e69f9a395e7c 100755 --- a/src/transformers/models/lfm2_vl/processing_lfm2_vl.py +++ b/src/transformers/models/lfm2_vl/processing_lfm2_vl.py @@ -64,12 +64,12 @@ class Lfm2VlProcessorKwargs(ProcessingKwargs, total=False): class Lfm2VlProcessor(ProcessorMixin): r""" - Constructs a Lfm2Vl processor which wraps a Lfm2Tokenizer tokenizer and Lfm2VlImageProcessor into a single processor. + Constructs a Lfm2Vl processor which wraps a Lfm2Tokenizer tokenizer and AutoImageProcessor into a single processor. [`Lfm2VlProcessor`] offers all the functionalities of [`Lfm2ImageProcessor`] and [`Lfm2Tokenizer`]. Args: - image_processor (`Lfm2VlImageProcessor`): + image_processor (`AutoImageProcessor`): An instance of [`Lfm2VlImageProcessor`]. The image processor is a required input. tokenizer (`PreTrainedTokenizerBase`): An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input. @@ -80,7 +80,7 @@ class Lfm2VlProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "Lfm2VlImageProcessorFast" + image_processor_class = "AutoImageProcessor" tokenizer_class = "AutoTokenizer" def __init__( diff --git a/src/transformers/models/mllama/processing_mllama.py b/src/transformers/models/mllama/processing_mllama.py index 3955006a4f9e..eda8160c2594 100644 --- a/src/transformers/models/mllama/processing_mllama.py +++ b/src/transformers/models/mllama/processing_mllama.py @@ -169,7 +169,7 @@ def build_string_from_input(prompt: str, bos_token: str, image_token: str) -> st class MllamaProcessor(ProcessorMixin): r""" Constructs a Mllama processor which wraps [`MllamaImageProcessor`] and - [`PretrainedTokenizerFast`] into a single processor that inherits both the image processor and + [`AutoTokenizer`] into a single processor that inherits both the image processor and tokenizer functionalities. See the [`~MllamaProcessor.__call__`] and [`~OwlViTProcessor.decode`] for more information. The preferred way of passing kwargs is as a dictionary per modality, see usage example below. @@ -189,7 +189,7 @@ class MllamaProcessor(ProcessorMixin): ``` Args: - image_processor ([`MllamaImageProcessor`]): + image_processor ([`AutoImageProcessor`]): The image processor is a required input. tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`]): The tokenizer is a required input. @@ -199,7 +199,7 @@ class MllamaProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "MllamaImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = "PreTrainedTokenizerFast" def __init__(self, image_processor, tokenizer, chat_template=None): diff --git a/src/transformers/models/oneformer/processing_oneformer.py b/src/transformers/models/oneformer/processing_oneformer.py index de5a0474e26a..ae269ec0812b 100644 --- a/src/transformers/models/oneformer/processing_oneformer.py +++ b/src/transformers/models/oneformer/processing_oneformer.py @@ -31,7 +31,7 @@ class OneFormerProcessor(ProcessorMixin): tokenizer functionalities. Args: - image_processor ([`OneFormerImageProcessor`]): + image_processor ([`AutoImageProcessor`]): The image processor is a required input. tokenizer ([`CLIPTokenizer`, `CLIPTokenizerFast`]): The tokenizer is a required input. 
@@ -42,7 +42,7 @@ class OneFormerProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "OneFormerImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast") def __init__( @@ -74,7 +74,7 @@ def __call__(self, images=None, task_inputs=None, segmentation_maps=None, **kwar Main method to prepare for the model one or several task input(s) and image(s). This method forwards the `task_inputs` and `kwargs` arguments to CLIPTokenizer's [`~CLIPTokenizer.__call__`] if `task_inputs` is not `None` to encode. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to - OneFormerImageProcessor's [`~OneFormerImageProcessor.__call__`] if `images` is not `None`. Please refer to the + AutoImageProcessor's [`~AutoImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring of the above two methods for more information. Args: @@ -137,7 +137,7 @@ def __call__(self, images=None, task_inputs=None, segmentation_maps=None, **kwar def encode_inputs(self, images=None, task_inputs=None, segmentation_maps=None, **kwargs): """ - This method forwards all its arguments to [`OneFormerImageProcessor.encode_inputs`] and then tokenizes the + This method forwards all its arguments to [`AutoImageProcessor.encode_inputs`] and then tokenizes the task_inputs. Please refer to the docstring of this method for more information. """ @@ -177,21 +177,21 @@ def encode_inputs(self, images=None, task_inputs=None, segmentation_maps=None, * def post_process_semantic_segmentation(self, *args, **kwargs): """ - This method forwards all its arguments to [`OneFormerImageProcessor.post_process_semantic_segmentation`]. + This method forwards all its arguments to [`AutoImageProcessor.post_process_semantic_segmentation`]. Please refer to the docstring of this method for more information. """ return self.image_processor.post_process_semantic_segmentation(*args, **kwargs) def post_process_instance_segmentation(self, *args, **kwargs): """ - This method forwards all its arguments to [`OneFormerImageProcessor.post_process_instance_segmentation`]. + This method forwards all its arguments to [`AutoImageProcessor.post_process_instance_segmentation`]. Please refer to the docstring of this method for more information. """ return self.image_processor.post_process_instance_segmentation(*args, **kwargs) def post_process_panoptic_segmentation(self, *args, **kwargs): """ - This method forwards all its arguments to [`OneFormerImageProcessor.post_process_panoptic_segmentation`]. + This method forwards all its arguments to [`AutoImageProcessor.post_process_panoptic_segmentation`]. Please refer to the docstring of this method for more information. """ return self.image_processor.post_process_panoptic_segmentation(*args, **kwargs) diff --git a/src/transformers/models/owlvit/processing_owlvit.py b/src/transformers/models/owlvit/processing_owlvit.py index e7fb401d9a76..7569de9be876 100644 --- a/src/transformers/models/owlvit/processing_owlvit.py +++ b/src/transformers/models/owlvit/processing_owlvit.py @@ -60,14 +60,14 @@ class OwlViTProcessor(ProcessorMixin): [`~OwlViTProcessor.__call__`] and [`~OwlViTProcessor.decode`] for more information. Args: - image_processor ([`OwlViTImageProcessor`], *optional*): + image_processor ([`AutoImageProcessor`], *optional*): The image processor is a required input. tokenizer ([`CLIPTokenizer`, `CLIPTokenizerFast`], *optional*): The tokenizer is a required input. 
""" attributes = ["image_processor", "tokenizer"] - image_processor_class = "OwlViTImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast") def __init__(self, image_processor=None, tokenizer=None, **kwargs): @@ -94,7 +94,7 @@ def __call__( Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode: the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to - CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring + AutoImageProcessor's [`~AutoImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring of the above two methods for more information. Args: diff --git a/src/transformers/models/phi4_multimodal/processing_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/processing_phi4_multimodal.py index 4a1af1d6bb78..ca9e5f4a168c 100644 --- a/src/transformers/models/phi4_multimodal/processing_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/processing_phi4_multimodal.py @@ -46,7 +46,7 @@ class Phi4MultimodalProcessor(ProcessorMixin): [`~Phi4MultimodalProcessor.__call__`] and [`~Phi4MultimodalProcessor.decode`] for more information. Args: - image_processor (`Phi4MultimodalImageProcessorFast`): + image_processor (`AutoImageProcessor`): The image processor to use for images. audio_processor (`Phi4MultimodalFeatureExtractor`): The audio processor to use for audio inputs. @@ -60,7 +60,7 @@ class Phi4MultimodalProcessor(ProcessorMixin): attributes = ["image_processor", "audio_processor", "tokenizer"] tokenizer_class = "GPT2TokenizerFast" - image_processor_class = "Phi4MultimodalImageProcessorFast" + image_processor_class = "AutoImageProcessor" audio_processor_class = "Phi4MultimodalFeatureExtractor" def __init__( @@ -87,7 +87,7 @@ def __call__( Main method to prepare for the model one or several sequences(s) and image(s). This method forards the `text` and `kwargs` arguments to GPT2Tokenizer's [`~GPT2Tokenizer.__call__`] if `text` is not `None` to encode the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to - Phi4MultimodalImageProcessorFast's [`~Phi4MultimodalImageProcessorFast.__call__`] if `images` is not `None`. Please refer to the doctsring + AutoImageProcessor's [`~AutoImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring of the above two methods for more information. Args: diff --git a/src/transformers/models/pix2struct/processing_pix2struct.py b/src/transformers/models/pix2struct/processing_pix2struct.py index fba2fe93ef19..c4c8f3f18d3d 100644 --- a/src/transformers/models/pix2struct/processing_pix2struct.py +++ b/src/transformers/models/pix2struct/processing_pix2struct.py @@ -55,14 +55,14 @@ class Pix2StructProcessor(ProcessorMixin): the docstring of [`~Pix2StructProcessor.__call__`] and [`~Pix2StructProcessor.decode`] for more information. Args: - image_processor (`Pix2StructImageProcessor`): + image_processor (`AutoImageProcessor`): An instance of [`Pix2StructImageProcessor`]. The image processor is a required input. tokenizer (Union[`T5TokenizerFast`, `T5Tokenizer`]): An instance of ['T5TokenizerFast`] or ['T5Tokenizer`]. The tokenizer is a required input. 
""" attributes = ["image_processor", "tokenizer"] - image_processor_class = "Pix2StructImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = ("T5Tokenizer", "T5TokenizerFast") def __init__(self, image_processor, tokenizer): @@ -76,7 +76,7 @@ def __call__( **kwargs: Unpack[Pix2StructProcessorKwargs], ) -> Union[BatchEncoding, BatchFeature]: """ - This method uses [`Pix2StructImageProcessor.preprocess`] method to prepare image(s) for the model, and + This method uses [`AutoImageProcessor.preprocess`] method to prepare image(s) for the model, and [`T5TokenizerFast.__call__`] to prepare text for the model. Please refer to the docstring of the above two methods for more information. diff --git a/src/transformers/models/sam/processing_sam.py b/src/transformers/models/sam/processing_sam.py index bc82daf2034d..0add0c9735ec 100644 --- a/src/transformers/models/sam/processing_sam.py +++ b/src/transformers/models/sam/processing_sam.py @@ -55,16 +55,16 @@ class SamProcessor(ProcessorMixin): Constructs a SAM processor which wraps a SAM image processor and an 2D points & Bounding boxes processor into a single processor. - [`SamProcessor`] offers all the functionalities of [`SamImageProcessor`]. See the docstring of - [`~SamImageProcessor.__call__`] for more information. + [`SamProcessor`] offers all the functionalities of [`AutoImageProcessor`]. See the docstring of + [`~AutoImageProcessor.__call__`] for more information. Args: - image_processor (`SamImageProcessor`): - An instance of [`SamImageProcessor`]. The image processor is a required input. + image_processor (`AutoImageProcessor`): + An instance of [`AutoImageProcessor`]. The image processor is a required input. """ attributes = ["image_processor"] - image_processor_class = "SamImageProcessor" + image_processor_class = "AutoImageProcessor" def __init__(self, image_processor): super().__init__(image_processor) @@ -77,7 +77,7 @@ def __call__( **kwargs, ) -> BatchEncoding: """ - This method uses [`SamImageProcessor.__call__`] method to prepare image(s) for the model. It also prepares 2D + This method uses [`AutoImageProcessor.__call__`] method to prepare image(s) for the model. It also prepares 2D points and bounding boxes for the model if they are provided. """ output_kwargs = self._merge_kwargs( diff --git a/src/transformers/models/sam2/processing_sam2.py b/src/transformers/models/sam2/processing_sam2.py index 5f147aab8dfa..45ebd1b64eed 100644 --- a/src/transformers/models/sam2/processing_sam2.py +++ b/src/transformers/models/sam2/processing_sam2.py @@ -40,12 +40,12 @@ class Sam2Processor(ProcessorMixin): Constructs a SAM2 processor which wraps a SAM2 image processor and an 2D points & Bounding boxes processor into a single processor. - [`Sam2Processor`] offers all the functionalities of [`Sam2ImageProcessorFast`] and [`Sam2VideoProcessor`]. See the docstring of - [`~Sam2ImageProcessorFast.__call__`] and [`~Sam2VideoProcessor.__call__`] for more information. + [`Sam2Processor`] offers all the functionalities of [`AutoImageProcessor`] and [`Sam2VideoProcessor`]. See the docstring of + [`~AutoImageProcessor.__call__`] and [`~Sam2VideoProcessor.__call__`] for more information. Args: - image_processor (`Sam2ImageProcessorFast`): - An instance of [`Sam2ImageProcessorFast`]. + image_processor (`AutoImageProcessor`): + An instance of [`AutoImageProcessor`]. target_size (`int`, *optional*): The target size (target_size, target_size) to which the image will be resized. 
point_pad_value (`int`, *optional*, defaults to -10): @@ -53,7 +53,7 @@ class Sam2Processor(ProcessorMixin): """ attributes = ["image_processor"] - image_processor_class = "Sam2ImageProcessorFast" + image_processor_class = "AutoImageProcessor" def __init__(self, image_processor, target_size: Optional[int] = None, point_pad_value: int = -10, **kwargs): super().__init__(image_processor, **kwargs) @@ -72,7 +72,7 @@ def __call__( **kwargs, ) -> BatchEncoding: r""" - This method uses [`Sam2ImageProcessorFast.__call__`] method to prepare image(s) for the model. It also prepares 2D + This method uses [`AutoImageProcessor.__call__`] method to prepare image(s) for the model. It also prepares 2D points and bounding boxes for the model if they are provided. Args: diff --git a/src/transformers/models/sam2_video/modular_sam2_video.py b/src/transformers/models/sam2_video/modular_sam2_video.py index 091844f0aa1c..4dcdf914c89b 100644 --- a/src/transformers/models/sam2_video/modular_sam2_video.py +++ b/src/transformers/models/sam2_video/modular_sam2_video.py @@ -606,12 +606,12 @@ class Sam2VideoProcessor(Sam2Processor): Constructs a SAM2 processor which wraps a SAM2 image processor and an 2D points & Bounding boxes processor into a single processor. - [`Sam2VideoProcessor`] offers all the functionalities of [`Sam2ImageProcessorFast`] and [`Sam2VideoProcessor`]. See the docstring of - [`~Sam2ImageProcessorFast.__call__`] and [`~Sam2VideoProcessor.__call__`] for more information. + [`Sam2VideoProcessor`] offers all the functionalities of [`AutoImageProcessor`] and [`Sam2VideoProcessor`]. See the docstring of + [`~AutoImageProcessor.__call__`] and [`~Sam2VideoProcessor.__call__`] for more information. Args: - image_processor (`Sam2ImageProcessorFast`): - An instance of [`Sam2ImageProcessorFast`]. + image_processor (`AutoImageProcessor`): + An instance of [`AutoImageProcessor`]. video_processor (`Sam2VideoVideoProcessor`): An instance of [`Sam2VideoVideoProcessor`]. target_size (`int`, *optional*): @@ -621,7 +621,7 @@ class Sam2VideoProcessor(Sam2Processor): """ attributes = ["image_processor", "video_processor"] - image_processor_class = "Sam2ImageProcessorFast" + image_processor_class = "AutoImageProcessor" video_processor_class = "Sam2VideoVideoProcessor" def __init__( diff --git a/src/transformers/models/sam2_video/processing_sam2_video.py b/src/transformers/models/sam2_video/processing_sam2_video.py index d5a3c94d7f87..5fd79a234183 100644 --- a/src/transformers/models/sam2_video/processing_sam2_video.py +++ b/src/transformers/models/sam2_video/processing_sam2_video.py @@ -39,11 +39,11 @@ class Sam2VideoProcessor(ProcessorMixin): Constructs a SAM2 processor which wraps a SAM2 image processor and an 2D points & Bounding boxes processor into a single processor. - [`Sam2VideoProcessor`] offers all the functionalities of [`Sam2ImageProcessorFast`] and [`Sam2VideoProcessor`]. See the docstring of + [`Sam2VideoProcessor`] offers all the functionalities of [`AutoImageProcessor`] and [`Sam2VideoProcessor`]. See the docstring of [`~Sam2ImageProcessorFast.__call__`] and [`~Sam2VideoProcessor.__call__`] for more information. Args: - image_processor (`Sam2ImageProcessorFast`): + image_processor (`AutoImageProcessor`): An instance of [`Sam2ImageProcessorFast`]. video_processor (`Sam2VideoVideoProcessor`): An instance of [`Sam2VideoVideoProcessor`]. 
@@ -54,7 +54,7 @@ class Sam2VideoProcessor(ProcessorMixin): """ attributes = ["image_processor", "video_processor"] - image_processor_class = "Sam2ImageProcessorFast" + image_processor_class = "AutoImageProcessor" video_processor_class = "Sam2VideoVideoProcessor" def __init__( @@ -76,7 +76,7 @@ def __call__( **kwargs, ) -> BatchEncoding: r""" - This method uses [`Sam2VideoImageProcessorFast.__call__`] method to prepare image(s) for the model. It also prepares 2D + This method uses [`AutoImageProcessor.__call__`] method to prepare image(s) for the model. It also prepares 2D points and bounding boxes for the model if they are provided. Args: diff --git a/src/transformers/models/sam_hq/processing_samhq.py b/src/transformers/models/sam_hq/processing_samhq.py index 902e68832836..0281bec63489 100644 --- a/src/transformers/models/sam_hq/processing_samhq.py +++ b/src/transformers/models/sam_hq/processing_samhq.py @@ -55,16 +55,16 @@ class SamHQProcessor(ProcessorMixin): Constructs a SAM HQ processor which wraps a SAM image processor and an 2D points & Bounding boxes processor into a single processor. - [`SamHQProcessor`] offers all the functionalities of [`SamImageProcessor`]. See the docstring of - [`~SamImageProcessor.__call__`] for more information. + [`SamHQProcessor`] offers all the functionalities of [`AutoImageProcessor`]. See the docstring of + [`~AutoImageProcessor.__call__`] for more information. Args: - image_processor (`SamImageProcessor`): - An instance of [`SamImageProcessor`]. The image processor is a required input. + image_processor (`AutoImageProcessor`): + An instance of [`AutoImageProcessor`]. The image processor is a required input. """ attributes = ["image_processor"] - image_processor_class = "SamImageProcessor" + image_processor_class = "AutoImageProcessor" def __init__(self, image_processor): super().__init__(image_processor) @@ -82,7 +82,7 @@ def __call__( **kwargs: Unpack[SamHQProcessorKwargs], ) -> BatchEncoding: """ - This method uses [`SamImageProcessor.__call__`] method to prepare image(s) for the model. It also prepares 2D + This method uses [`AutoImageProcessor.__call__`] method to prepare image(s) for the model. It also prepares 2D points and bounding boxes for the model if they are provided. """ output_kwargs = self._merge_kwargs( diff --git a/src/transformers/models/siglip2/processing_siglip2.py b/src/transformers/models/siglip2/processing_siglip2.py index b16650303da4..4a9dbec65d4e 100644 --- a/src/transformers/models/siglip2/processing_siglip2.py +++ b/src/transformers/models/siglip2/processing_siglip2.py @@ -37,11 +37,11 @@ class Siglip2Processor(ProcessorMixin): r""" Constructs a Siglip2 processor which wraps a Siglip2 image processor and a Gemma tokenizer into a single processor. - [`Siglip2Processor`] offers all the functionalities of [`Siglip2ImageProcessor`] and [`GemmaTokenizerFast`]. See the + [`Siglip2Processor`] offers all the functionalities of [`AutoImageProcessor`] and [`GemmaTokenizerFast`]. See the [`~Siglip2Processor.__call__`] and [`~Siglip2Processor.decode`] for more information. Args: - image_processor ([`Siglip2ImageProcessor`]): + image_processor ([`AutoImageProcessor`]): The image processor is a required input. tokenizer ([`GemmaTokenizerFast`]): The tokenizer is a required input. 
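The `BaseImageProcessor.__init__` warning added earlier in this commit surfaces whenever a slow image processor class is instantiated directly, independently of the `Auto` classes. A rough sketch, using ViLT purely as an illustration of a model that still ships a slow class:

```python
from transformers import ViltImageProcessor

# Instantiating a slow image processor class directly now emits a one-time
# warning pointing to AutoImageProcessor or the model-specific fast class.
image_processor = ViltImageProcessor()
print(image_processor.is_fast)  # expected: False
```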
diff --git a/src/transformers/models/smolvlm/processing_smolvlm.py b/src/transformers/models/smolvlm/processing_smolvlm.py index 86d07e238f1b..bf400dbb0c15 100644 --- a/src/transformers/models/smolvlm/processing_smolvlm.py +++ b/src/transformers/models/smolvlm/processing_smolvlm.py @@ -127,7 +127,7 @@ class SmolVLMProcessor(ProcessorMixin): the docstring of [`~IdeficsProcessor.__call__`] and [`~IdeficsProcessor.decode`] for more information. Args: - image_processor (`SmolVLMImageProcessor`): + image_processor (`AutoImageProcessor`): An instance of [`SmolVLMImageProcessor`]. The image processor is a required input. tokenizer (`PreTrainedTokenizerBase`): An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input. @@ -142,7 +142,7 @@ class SmolVLMProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer", "video_processor"] - image_processor_class = "SmolVLMImageProcessor" + image_processor_class = "AutoImageProcessor" video_processor_class = "SmolVLMVideoProcessor" # NOTE: uses different interpolation than slow processors tokenizer_class = "AutoTokenizer" diff --git a/src/transformers/models/tvp/processing_tvp.py b/src/transformers/models/tvp/processing_tvp.py index 7cec0f14ab76..0bb0a265f5e7 100644 --- a/src/transformers/models/tvp/processing_tvp.py +++ b/src/transformers/models/tvp/processing_tvp.py @@ -38,14 +38,14 @@ class TvpProcessor(ProcessorMixin): [`~TvpProcessor.__call__`] and [`~TvpProcessor.decode`] for more information. Args: - image_processor ([`TvpImageProcessor`], *optional*): + image_processor ([`AutoImageProcessor`], *optional*): The image processor is a required input. tokenizer ([`BertTokenizerFast`], *optional*): The tokenizer is a required input. """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "TvpImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = ("BertTokenizer", "BertTokenizerFast") def __init__(self, image_processor=None, tokenizer=None, **kwargs): diff --git a/src/transformers/models/udop/processing_udop.py b/src/transformers/models/udop/processing_udop.py index 1be71aea63e2..9669f5eb8e2e 100644 --- a/src/transformers/models/udop/processing_udop.py +++ b/src/transformers/models/udop/processing_udop.py @@ -67,14 +67,14 @@ class UdopProcessor(ProcessorMixin): prepare labels for language modeling tasks. Args: - image_processor (`LayoutLMv3ImageProcessor`): + image_processor (`AutoImageProcessor`): An instance of [`LayoutLMv3ImageProcessor`]. The image processor is a required input. tokenizer (`UdopTokenizer` or `UdopTokenizerFast`): An instance of [`UdopTokenizer`] or [`UdopTokenizerFast`]. The tokenizer is a required input. """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "LayoutLMv3ImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = ("UdopTokenizer", "UdopTokenizerFast") def __init__(self, image_processor, tokenizer): @@ -88,9 +88,9 @@ def __call__( ) -> BatchFeature: """ This method first forwards the `images` argument to [`~UdopImageProcessor.__call__`]. In case - [`UdopImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and + [`AutoImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and bounding boxes along with the additional arguments to [`~UdopTokenizer.__call__`] and returns the output, - together with the prepared `pixel_values`. 
In case [`UdopImageProcessor`] was initialized with `apply_ocr` set + together with the prepared `pixel_values`. In case [`AutoImageProcessor`] was initialized with `apply_ocr` set to `False`, it passes the words (`text`/``text_pair`) and `boxes` specified by the user along with the additional arguments to [`~UdopTokenizer.__call__`] and returns the output, together with the prepared `pixel_values`. diff --git a/src/transformers/models/video_llava/processing_video_llava.py b/src/transformers/models/video_llava/processing_video_llava.py index a6f826fa72a3..1506b15d458b 100644 --- a/src/transformers/models/video_llava/processing_video_llava.py +++ b/src/transformers/models/video_llava/processing_video_llava.py @@ -32,13 +32,13 @@ class VideoLlavaProcessor(ProcessorMixin): r""" - Constructs a VideoLlava processor which wraps a VideoLlava image processor and a Llava tokenizer into a single processor. + Constructs a VideoLlava processor which wraps a AutoImageProcessor and a Llava tokenizer into a single processor. [`VideoLlavaProcessor`] offers all the functionalities of [`VideoLlavaImageProcessor`] and [`LlamaTokenizerFast`]. See the [`~VideoLlavaProcessor.__call__`] and [`~VideoLlavaProcessor.decode`] for more information. Args: - image_processor ([`VideoLlavaImageProcessor`], *optional*): + image_processor ([`AutoImageProcessor`], *optional*): The image processor is a required input. video_processor ([`VideoLlavaVideoProcessor`], *optional*): The video processor is a required input. @@ -61,7 +61,7 @@ class VideoLlavaProcessor(ProcessorMixin): """ attributes = ["image_processor", "video_processor", "tokenizer"] - image_processor_class = "VideoLlavaImageProcessor" + image_processor_class = "AutoImageProcessor" video_processor_class = "AutoVideoProcessor" tokenizer_class = "AutoTokenizer" diff --git a/src/transformers/models/vilt/processing_vilt.py b/src/transformers/models/vilt/processing_vilt.py index 5b5126ad4a85..209ab1f362ed 100644 --- a/src/transformers/models/vilt/processing_vilt.py +++ b/src/transformers/models/vilt/processing_vilt.py @@ -38,20 +38,20 @@ class ViltProcessorKwargs(ProcessingKwargs, total=False): class ViltProcessor(ProcessorMixin): r""" - Constructs a ViLT processor which wraps a BERT tokenizer and ViLT image processor into a single processor. + Constructs a ViLT processor which wraps a BERT tokenizer and AutoImageProcessor into a single processor. [`ViltProcessor`] offers all the functionalities of [`ViltImageProcessor`] and [`BertTokenizerFast`]. See the docstring of [`~ViltProcessor.__call__`] and [`~ViltProcessor.decode`] for more information. Args: - image_processor (`ViltImageProcessor`, *optional*): - An instance of [`ViltImageProcessor`]. The image processor is a required input. + image_processor (`AutoImageProcessor`, *optional*): + An instance of [`AutoImageProcessor`]. The image processor is a required input. tokenizer (`BertTokenizerFast`, *optional*): An instance of ['BertTokenizerFast`]. The tokenizer is a required input. 
""" attributes = ["image_processor", "tokenizer"] - image_processor_class = "ViltImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = ("BertTokenizer", "BertTokenizerFast") valid_processor_kwargs = ViltProcessorKwargs diff --git a/src/transformers/models/x_clip/processing_x_clip.py b/src/transformers/models/x_clip/processing_x_clip.py index 581dabc6d8b5..d2878c3e2018 100644 --- a/src/transformers/models/x_clip/processing_x_clip.py +++ b/src/transformers/models/x_clip/processing_x_clip.py @@ -23,20 +23,20 @@ class XCLIPProcessor(ProcessorMixin): r""" - Constructs an X-CLIP processor which wraps a VideoMAE image processor and a CLIP tokenizer into a single processor. + Constructs an X-CLIP processor which wraps a AutoImageProcessor and a CLIP tokenizer into a single processor. - [`XCLIPProcessor`] offers all the functionalities of [`VideoMAEImageProcessor`] and [`CLIPTokenizerFast`]. See the + [`XCLIPProcessor`] offers all the functionalities of [`AutoImageProcessor`] and [`CLIPTokenizerFast`]. See the [`~XCLIPProcessor.__call__`] and [`~XCLIPProcessor.decode`] for more information. Args: - image_processor ([`VideoMAEImageProcessor`], *optional*): + image_processor ([`AutoImageProcessor`], *optional*): The image processor is a required input. tokenizer ([`CLIPTokenizerFast`], *optional*): The tokenizer is a required input. """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "VideoMAEImageProcessor" + image_processor_class = "AutoImageProcessor" tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast") def __init__(self, image_processor=None, tokenizer=None, **kwargs): From c35a1c803e5e567b228a13c0008e132169540739 Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Mon, 6 Oct 2025 19:09:51 +0000 Subject: [PATCH 3/4] fix-copies --- src/transformers/models/owlv2/processing_owlv2.py | 3 +-- src/transformers/models/sam2_video/processing_sam2_video.py | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/owlv2/processing_owlv2.py b/src/transformers/models/owlv2/processing_owlv2.py index 65f111e2ca79..8be195c78e10 100644 --- a/src/transformers/models/owlv2/processing_owlv2.py +++ b/src/transformers/models/owlv2/processing_owlv2.py @@ -84,7 +84,7 @@ def __call__( Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode: the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to - CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring + AutoImageProcessor's [`~AutoImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring of the above two methods for more information. 
Args: @@ -151,7 +151,6 @@ def __call__( if return_tensors == "np": input_ids = np.concatenate([encoding["input_ids"] for encoding in encodings], axis=0) attention_mask = np.concatenate([encoding["attention_mask"] for encoding in encodings], axis=0) - elif return_tensors == "pt" and is_torch_available(): import torch diff --git a/src/transformers/models/sam2_video/processing_sam2_video.py b/src/transformers/models/sam2_video/processing_sam2_video.py index 5fd79a234183..e3e244d8570a 100644 --- a/src/transformers/models/sam2_video/processing_sam2_video.py +++ b/src/transformers/models/sam2_video/processing_sam2_video.py @@ -40,11 +40,11 @@ class Sam2VideoProcessor(ProcessorMixin): single processor. [`Sam2VideoProcessor`] offers all the functionalities of [`AutoImageProcessor`] and [`Sam2VideoProcessor`]. See the docstring of - [`~Sam2ImageProcessorFast.__call__`] and [`~Sam2VideoProcessor.__call__`] for more information. + [`~AutoImageProcessor.__call__`] and [`~Sam2VideoProcessor.__call__`] for more information. Args: image_processor (`AutoImageProcessor`): - An instance of [`Sam2ImageProcessorFast`]. + An instance of [`AutoImageProcessor`]. video_processor (`Sam2VideoVideoProcessor`): An instance of [`Sam2VideoVideoProcessor`]. target_size (`int`, *optional*): From 94696184d43029a30769e37215a873824aedd51f Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Wed, 15 Oct 2025 10:07:16 +0000 Subject: [PATCH 4/4] Fix more tests --- .../models/auto/tokenization_auto.py | 1 + .../models/blip/processing_blip.py | 2 +- .../models/blip_2/processing_blip_2.py | 2 +- .../chinese_clip/processing_chinese_clip.py | 4 +- .../models/clip/processing_clip.py | 2 +- .../models/clipseg/processing_clipseg.py | 4 +- .../instructblip/processing_instructblip.py | 2 +- .../models/kosmos2/processing_kosmos2.py | 2 +- src/transformers/processing_utils.py | 125 ++++++++++++------ tests/models/align/test_processing_align.py | 22 +-- tests/models/blip/test_processing_blip.py | 23 +++- tests/models/blip_2/test_processing_blip_2.py | 18 ++- .../test_processing_chinese_clip.py | 27 ++-- tests/models/clip/test_processing_clip.py | 27 ++-- .../models/clipseg/test_processing_clipseg.py | 28 ++-- tests/models/flava/test_processing_flava.py | 30 +++-- tests/models/git/test_processing_git.py | 19 ++- ...ssor_glm4v.py => test_processing_glm4v.py} | 0 .../test_processing_grounding_dino.py | 24 +++- .../test_processing_instructblip.py | 17 ++- .../models/kosmos2/test_processing_kosmos2.py | 44 +++--- ...mos2_5.py => test_processing_kosmos2_5.py} | 20 +-- ...ssor_ovis2.py => test_processing_ovis2.py} | 0 ...cessor_sam2.py => test_processing_sam2.py} | 0 ...video.py => test_processing_sam2_video.py} | 0 tests/test_processing_common.py | 32 ++++- 26 files changed, 313 insertions(+), 162 deletions(-) rename tests/models/glm4v/{test_processor_glm4v.py => test_processing_glm4v.py} (100%) rename tests/models/kosmos2_5/{test_processor_kosmos2_5.py => test_processing_kosmos2_5.py} (96%) rename tests/models/ovis2/{test_processor_ovis2.py => test_processing_ovis2.py} (100%) rename tests/models/sam2/{test_processor_sam2.py => test_processing_sam2.py} (100%) rename tests/models/sam2_video/{test_processor_sam2_video.py => test_processing_sam2_video.py} (100%) diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index ccee9937afa6..35dd6960ed37 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -253,6 +253,7 @@ 
("FastSpeech2ConformerTokenizer" if is_g2p_en_available() else None, None), ), ("flaubert", ("FlaubertTokenizer", None)), + ("flava", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("flex_olmo", (None, "GPT2TokenizerFast" if is_tokenizers_available() else None)), ("fnet", ("FNetTokenizer", "FNetTokenizerFast" if is_tokenizers_available() else None)), ("fsmt", ("FSMTTokenizer", None)), diff --git a/src/transformers/models/blip/processing_blip.py b/src/transformers/models/blip/processing_blip.py index f600e8ce27d8..a54436bcafd6 100644 --- a/src/transformers/models/blip/processing_blip.py +++ b/src/transformers/models/blip/processing_blip.py @@ -54,7 +54,7 @@ class BlipProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - image_processor_class = ("BlipImageProcessor", "BlipImageProcessorFast") + image_processor_class = "AutoImageProcessor" tokenizer_class = ("BertTokenizer", "BertTokenizerFast") def __init__(self, image_processor, tokenizer, **kwargs): diff --git a/src/transformers/models/blip_2/processing_blip_2.py b/src/transformers/models/blip_2/processing_blip_2.py index 40729f4f4501..4382cc3cfaa0 100644 --- a/src/transformers/models/blip_2/processing_blip_2.py +++ b/src/transformers/models/blip_2/processing_blip_2.py @@ -61,7 +61,7 @@ class Blip2Processor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - image_processor_class = ("BlipImageProcessor", "BlipImageProcessorFast") + image_processor_class = "AutoImageProcessor" tokenizer_class = "AutoTokenizer" def __init__(self, image_processor, tokenizer, num_query_tokens=None, **kwargs): diff --git a/src/transformers/models/chinese_clip/processing_chinese_clip.py b/src/transformers/models/chinese_clip/processing_chinese_clip.py index 0510b9b0f3c9..7c6bd572deeb 100644 --- a/src/transformers/models/chinese_clip/processing_chinese_clip.py +++ b/src/transformers/models/chinese_clip/processing_chinese_clip.py @@ -35,8 +35,8 @@ class ChineseCLIPProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - image_processor_class = ("ChineseCLIPImageProcessor", "ChineseCLIPImageProcessorFast") - tokenizer_class = ("BertTokenizer", "BertTokenizerFast") + image_processor_class = "AutoImageProcessor" + tokenizer_class = "AutoTokenizer" def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/clip/processing_clip.py b/src/transformers/models/clip/processing_clip.py index 7b856f9981ee..fe56058996e4 100644 --- a/src/transformers/models/clip/processing_clip.py +++ b/src/transformers/models/clip/processing_clip.py @@ -34,7 +34,7 @@ class CLIPProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - image_processor_class = ("CLIPImageProcessor", "CLIPImageProcessorFast") + image_processor_class = "AutoImageProcessor" tokenizer_class = "AutoTokenizer" def __init__(self, image_processor=None, tokenizer=None, **kwargs): diff --git a/src/transformers/models/clipseg/processing_clipseg.py b/src/transformers/models/clipseg/processing_clipseg.py index 39e091106c71..4a8ee28f65f5 100644 --- a/src/transformers/models/clipseg/processing_clipseg.py +++ b/src/transformers/models/clipseg/processing_clipseg.py @@ -35,8 +35,8 @@ class CLIPSegProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - image_processor_class = ("ViTImageProcessor", "ViTImageProcessorFast") - tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast") + image_processor_class 
= "AutoImageProcessor" + tokenizer_class = "AutoTokenizer" def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/instructblip/processing_instructblip.py b/src/transformers/models/instructblip/processing_instructblip.py index afe43c1fc7a7..8b749aa21d99 100644 --- a/src/transformers/models/instructblip/processing_instructblip.py +++ b/src/transformers/models/instructblip/processing_instructblip.py @@ -66,7 +66,7 @@ class InstructBlipProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer", "qformer_tokenizer"] - image_processor_class = ("BlipImageProcessor", "BlipImageProcessorFast") + image_processor_class = "AutoImageProcessor" tokenizer_class = "AutoTokenizer" qformer_tokenizer_class = "AutoTokenizer" diff --git a/src/transformers/models/kosmos2/processing_kosmos2.py b/src/transformers/models/kosmos2/processing_kosmos2.py index f9fb98df6ac2..5db8d32b9fe2 100644 --- a/src/transformers/models/kosmos2/processing_kosmos2.py +++ b/src/transformers/models/kosmos2/processing_kosmos2.py @@ -86,7 +86,7 @@ class Kosmos2Processor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - image_processor_class = ("CLIPImageProcessor", "CLIPImageProcessorFast") + image_processor_class = "AutoImageProcessor" tokenizer_class = "AutoTokenizer" def __init__(self, image_processor, tokenizer, num_patch_index_tokens=1024, *kwargs): diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 55844c8d9cce..e96fe5c4767e 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -36,24 +36,6 @@ from .dynamic_module_utils import custom_object_save from .feature_extraction_utils import BatchFeature from .image_utils import ChannelDimension, ImageInput, is_vision_available -from .utils.chat_template_utils import render_jinja_template -from .utils.type_validators import ( - device_validator, - image_size_validator, - padding_validator, - positive_any_number, - positive_int, - resampling_validator, - tensor_type_validator, - truncation_validator, - video_metadata_validator, -) -from .video_utils import VideoInput, VideoMetadataType - - -if is_vision_available(): - from .image_utils import PILImageResampling - from .tokenization_utils_base import ( PaddingStrategy, PreTokenizedInput, @@ -79,12 +61,27 @@ list_repo_templates, logging, ) +from .utils.chat_template_utils import render_jinja_template from .utils.deprecation import deprecate_kwarg +from .utils.type_validators import ( + device_validator, + image_size_validator, + padding_validator, + positive_any_number, + positive_int, + resampling_validator, + tensor_type_validator, + truncation_validator, + video_metadata_validator, +) +from .video_utils import VideoInput, VideoMetadataType if is_torch_available(): from .modeling_utils import PreTrainedAudioTokenizerBase +if is_vision_available(): + from .image_utils import PILImageResampling logger = logging.get_logger(__name__) @@ -95,6 +92,38 @@ transformers_module = direct_transformers_import(Path(__file__).parent) +class _LazyAutoProcessorMapping(dict): + """ + Lazy dictionary to avoid circular imports. + The mapping names are only imported when accessed. 
+ """ + + _MAPPING_NAMES = { + "image_processor": ("transformers.models.auto.image_processing_auto", "IMAGE_PROCESSOR_MAPPING_NAMES"), + "video_processor": ("transformers.models.auto.video_processing_auto", "VIDEO_PROCESSOR_MAPPING_NAMES"), + "feature_extractor": ( + "transformers.models.auto.feature_extraction_auto", + "FEATURE_EXTRACTOR_MAPPING_NAMES", + ), + "tokenizer": ("transformers.models.auto.tokenization_auto", "TOKENIZER_MAPPING_NAMES"), + } + + def __getitem__(self, key): + if key not in self._MAPPING_NAMES: + raise KeyError(key) + module_name, attr_name = self._MAPPING_NAMES[key] + module = __import__(module_name, fromlist=[attr_name]) + return getattr(module, attr_name) + + def __contains__(self, key): + return key in self._MAPPING_NAMES + + def keys(self): + return self._MAPPING_NAMES.keys() + + +MODALITY_TO_AUTOPROCESSOR_MAPPING = _LazyAutoProcessorMapping() + AUTO_TO_BASE_CLASS_MAPPING = { "AutoTokenizer": "PreTrainedTokenizerBase", "AutoFeatureExtractor": "FeatureExtractionMixin", @@ -102,6 +131,11 @@ "AutoVideoProcessor": "BaseVideoProcessor", } +SPECIAL_MODULE_TO_MODEL_NAME_MAPPING = { + "kosmos2_5": "kosmos-2.5", + "kosmos2": "kosmos-2", +} + if sys.version_info >= (3, 11): Unpack = typing.Unpack else: @@ -1497,30 +1531,45 @@ def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs) via methods like `AutoTokenizer.register()`. If neither of these conditions are fulfilled, this method will be unable to find the relevant subcomponent class and will raise an error. """ + # Lazy import to avoid circular imports + args = [] - for attribute_name in cls.attributes: - class_name = getattr(cls, f"{attribute_name}_class") - if isinstance(class_name, tuple): - classes = tuple(cls.get_possibly_dynamic_module(n) if n is not None else None for n in class_name) - if attribute_name == "image_processor": - # TODO: @yoni, change logic in v4.52 (when use_fast set to True by default) - use_fast = kwargs.get("use_fast") - if use_fast is None: - logger.warning_once( - "Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. " - "`use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. " - "This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`." 
- ) - else: + # get args from processor init signature + model_name_lowercase = cls.__module__.split(".")[-1].replace("processing_", "").split(".")[0] + sub_processors = inspect.signature(cls.__init__).parameters.keys() + for sub_processor_type in sub_processors: + if sub_processor_type not in MODALITY_TO_AUTOPROCESSOR_MAPPING and "tokenizer" in sub_processor_type: + sub_processor_type = "tokenizer" + if sub_processor_type in MODALITY_TO_AUTOPROCESSOR_MAPPING: + sub_processor_names = MODALITY_TO_AUTOPROCESSOR_MAPPING[sub_processor_type].get( + model_name_lowercase, None + ) + if sub_processor_names is None: + sub_processor_names = MODALITY_TO_AUTOPROCESSOR_MAPPING[sub_processor_type].get( + model_name_lowercase.replace("_", "-"), None + ) + if sub_processor_names is None: + sub_processor_names = MODALITY_TO_AUTOPROCESSOR_MAPPING[sub_processor_type].get( + SPECIAL_MODULE_TO_MODEL_NAME_MAPPING.get(model_name_lowercase, None), None + ) + if sub_processor_names is None: + raise ValueError( + f"Could not find component class name for {sub_processor_type} and {model_name_lowercase}" + ) + if isinstance(sub_processor_names, tuple): use_fast = kwargs.get("use_fast", True) - if use_fast and classes[1] is not None: - attribute_class = classes[1] + if use_fast and sub_processor_names[1] is not None: + sub_processor_name = sub_processor_names[1] + else: + sub_processor_name = sub_processor_names[0] else: - attribute_class = classes[0] - else: - attribute_class = cls.get_possibly_dynamic_module(class_name) + sub_processor_name = sub_processor_names - args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs)) + if hasattr(transformers_module, sub_processor_name): + sub_processor_class = getattr(transformers_module, sub_processor_name) + args.append(sub_processor_class.from_pretrained(pretrained_model_name_or_path, **kwargs)) + else: + raise ValueError(f"Could not find module {sub_processor_name} in `transformers`.") return args diff --git a/tests/models/align/test_processing_align.py b/tests/models/align/test_processing_align.py index 0adfc5a82205..8b2979456d7c 100644 --- a/tests/models/align/test_processing_align.py +++ b/tests/models/align/test_processing_align.py @@ -23,13 +23,15 @@ from transformers import BertTokenizer, BertTokenizerFast from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES from transformers.testing_utils import require_vision -from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available +from transformers.utils import IMAGE_PROCESSOR_NAME, is_torchvision_available, is_vision_available from ...test_processing_common import ProcessorTesterMixin if is_vision_available(): from transformers import AlignProcessor, EfficientNetImageProcessor +if is_torchvision_available(): + from transformers import EfficientNetImageProcessorFast @require_vision @@ -80,6 +82,9 @@ def get_rust_tokenizer(self, **kwargs): def get_image_processor(self, **kwargs): return EfficientNetImageProcessor.from_pretrained(self.tmpdirname, **kwargs) + def get_image_processor_fast(self, **kwargs): + return EfficientNetImageProcessorFast.from_pretrained(self.tmpdirname, **kwargs) + def tearDown(self): shutil.rmtree(self.tmpdirname) @@ -87,12 +92,13 @@ def test_save_load_pretrained_default(self): tokenizer_slow = self.get_tokenizer() tokenizer_fast = self.get_rust_tokenizer() image_processor = self.get_image_processor() + image_processor_fast = self.get_image_processor_fast() processor_slow = AlignProcessor(tokenizer=tokenizer_slow, image_processor=image_processor) 
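To make the new lookup in `_get_arguments_from_pretrained` concrete, here is a simplified walk-through of the same resolution (a sketch only, not the actual helper; the module name and modality are examples): the processor's module name yields the model type, which is looked up in the auto mapping, and the fast class name is preferred when one is registered.

from transformers.models.auto.image_processing_auto import IMAGE_PROCESSOR_MAPPING_NAMES

module_name = "transformers.models.clip.processing_clip"
model_type = module_name.split(".")[-1].replace("processing_", "")  # -> "clip"

names = IMAGE_PROCESSOR_MAPPING_NAMES[model_type]
if isinstance(names, tuple):
    class_name = names[1] or names[0]  # prefer the fast variant when it exists
else:
    class_name = names
print(class_name)  # "CLIPImageProcessorFast"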
processor_slow.save_pretrained(self.tmpdirname) processor_slow = AlignProcessor.from_pretrained(self.tmpdirname, use_fast=False) - processor_fast = AlignProcessor(tokenizer=tokenizer_fast, image_processor=image_processor) + processor_fast = AlignProcessor(tokenizer=tokenizer_fast, image_processor=image_processor_fast) processor_fast.save_pretrained(self.tmpdirname) processor_fast = AlignProcessor.from_pretrained(self.tmpdirname) @@ -103,16 +109,16 @@ def test_save_load_pretrained_default(self): self.assertIsInstance(processor_fast.tokenizer, BertTokenizerFast) self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string()) + self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor_fast.to_json_string()) self.assertIsInstance(processor_slow.image_processor, EfficientNetImageProcessor) - self.assertIsInstance(processor_fast.image_processor, EfficientNetImageProcessor) + self.assertIsInstance(processor_fast.image_processor, EfficientNetImageProcessorFast) def test_save_load_pretrained_additional_features(self): processor = AlignProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) processor.save_pretrained(self.tmpdirname) tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) + image_processor_add_kwargs = self.get_image_processor_fast(do_normalize=False, padding_value=1.0) processor = AlignProcessor.from_pretrained( self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 @@ -122,7 +128,7 @@ def test_save_load_pretrained_additional_features(self): self.assertIsInstance(processor.tokenizer, BertTokenizerFast) self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, EfficientNetImageProcessor) + self.assertIsInstance(processor.image_processor, EfficientNetImageProcessorFast) def test_image_processor(self): image_processor = self.get_image_processor() @@ -132,8 +138,8 @@ def test_image_processor(self): image_input = self.prepare_image_inputs() - input_image_proc = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") + input_image_proc = image_processor(image_input, return_tensors="pt") + input_processor = processor(images=image_input, return_tensors="pt") for key in input_image_proc: self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2) diff --git a/tests/models/blip/test_processing_blip.py b/tests/models/blip/test_processing_blip.py index d9f045332ed3..a807b210e52d 100644 --- a/tests/models/blip/test_processing_blip.py +++ b/tests/models/blip/test_processing_blip.py @@ -17,17 +17,26 @@ import pytest -from transformers.testing_utils import require_torch, require_vision -from transformers.utils import is_vision_available +from transformers.testing_utils import require_torch, require_torchvision, require_vision +from transformers.utils import is_torchvision_available, is_vision_available from ...test_processing_common import ProcessorTesterMixin if is_vision_available(): - from transformers import AutoProcessor, BertTokenizer, BlipImageProcessor, BlipProcessor, PreTrainedTokenizerFast + from transformers import ( + AutoProcessor, + BertTokenizer, + 
BlipProcessor, + PreTrainedTokenizerFast, + ) + +if is_torchvision_available(): + from transformers import BlipImageProcessorFast @require_vision +@require_torchvision class BlipProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = BlipProcessor @@ -35,7 +44,7 @@ class BlipProcessorTest(ProcessorTesterMixin, unittest.TestCase): def setUpClass(cls): cls.tmpdirname = tempfile.mkdtemp() - image_processor = BlipImageProcessor() + image_processor = BlipImageProcessorFast() tokenizer = BertTokenizer.from_pretrained("hf-internal-testing/tiny-random-BertModel") processor = BlipProcessor(image_processor, tokenizer) @@ -68,7 +77,7 @@ def test_save_load_pretrained_additional_features(self): self.assertIsInstance(processor.tokenizer, PreTrainedTokenizerFast) self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, BlipImageProcessor) + self.assertIsInstance(processor.image_processor, BlipImageProcessorFast) def test_image_processor(self): image_processor = self.get_image_processor() @@ -78,8 +87,8 @@ def test_image_processor(self): image_input = self.prepare_image_inputs() - input_feat_extract = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") + input_feat_extract = image_processor(image_input, return_tensors="pt") + input_processor = processor(images=image_input, return_tensors="pt") for key in input_feat_extract: self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) diff --git a/tests/models/blip_2/test_processing_blip_2.py b/tests/models/blip_2/test_processing_blip_2.py index e5c17a11ce02..1a4d7f3894aa 100644 --- a/tests/models/blip_2/test_processing_blip_2.py +++ b/tests/models/blip_2/test_processing_blip_2.py @@ -17,17 +17,21 @@ import pytest -from transformers.testing_utils import require_vision -from transformers.utils import is_vision_available +from transformers.testing_utils import require_torchvision, require_vision +from transformers.utils import is_torchvision_available, is_vision_available from ...test_processing_common import ProcessorTesterMixin if is_vision_available(): - from transformers import AutoProcessor, Blip2Processor, BlipImageProcessor, GPT2Tokenizer, PreTrainedTokenizerFast + from transformers import AutoProcessor, Blip2Processor, GPT2Tokenizer, PreTrainedTokenizerFast + +if is_torchvision_available(): + from transformers import BlipImageProcessorFast @require_vision +@require_torchvision class Blip2ProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = Blip2Processor @@ -35,7 +39,7 @@ class Blip2ProcessorTest(ProcessorTesterMixin, unittest.TestCase): def setUpClass(cls): cls.tmpdirname = tempfile.mkdtemp() - image_processor = BlipImageProcessor() + image_processor = BlipImageProcessorFast() tokenizer = GPT2Tokenizer.from_pretrained("hf-internal-testing/tiny-random-GPT2Model") processor = Blip2Processor(image_processor, tokenizer) @@ -71,7 +75,7 @@ def test_save_load_pretrained_additional_features(self): self.assertIsInstance(processor.tokenizer, PreTrainedTokenizerFast) self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, BlipImageProcessor) + self.assertIsInstance(processor.image_processor, BlipImageProcessorFast) def test_image_processor(self): image_processor = self.get_image_processor() @@ -81,8 +85,8 @@ def 
test_image_processor(self): image_input = self.prepare_image_inputs() - input_feat_extract = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") + input_feat_extract = image_processor(image_input, return_tensors="pt") + input_processor = processor(images=image_input, return_tensors="pt") for key in input_feat_extract: self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) diff --git a/tests/models/chinese_clip/test_processing_chinese_clip.py b/tests/models/chinese_clip/test_processing_chinese_clip.py index 5aef3d06c15b..9407b503ce79 100644 --- a/tests/models/chinese_clip/test_processing_chinese_clip.py +++ b/tests/models/chinese_clip/test_processing_chinese_clip.py @@ -22,8 +22,8 @@ from transformers import BertTokenizer, BertTokenizerFast from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES -from transformers.testing_utils import require_vision -from transformers.utils import FEATURE_EXTRACTOR_NAME, is_vision_available +from transformers.testing_utils import require_torchvision, require_vision +from transformers.utils import FEATURE_EXTRACTOR_NAME, is_torchvision_available, is_vision_available from ...test_processing_common import ProcessorTesterMixin @@ -31,8 +31,12 @@ if is_vision_available(): from transformers import ChineseCLIPImageProcessor, ChineseCLIPProcessor +if is_torchvision_available(): + from transformers import ChineseCLIPImageProcessorFast + @require_vision +@require_torchvision class ChineseCLIPProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = ChineseCLIPProcessor @@ -95,6 +99,10 @@ def get_rust_tokenizer(cls, **kwargs): def get_image_processor(cls, **kwargs): return ChineseCLIPImageProcessor.from_pretrained(cls.tmpdirname, **kwargs) + @classmethod + def get_image_processor_fast(cls, **kwargs): + return ChineseCLIPImageProcessorFast.from_pretrained(cls.tmpdirname, **kwargs) + @classmethod def tearDownClass(cls): shutil.rmtree(cls.tmpdirname, ignore_errors=True) @@ -103,13 +111,14 @@ def test_save_load_pretrained_default(self): tokenizer_slow = self.get_tokenizer() tokenizer_fast = self.get_rust_tokenizer() image_processor = self.get_image_processor() + image_processor_fast = self.get_image_processor_fast() with tempfile.TemporaryDirectory() as tmpdir: processor_slow = ChineseCLIPProcessor(tokenizer=tokenizer_slow, image_processor=image_processor) processor_slow.save_pretrained(tmpdir) processor_slow = ChineseCLIPProcessor.from_pretrained(self.tmpdirname, use_fast=False) - processor_fast = ChineseCLIPProcessor(tokenizer=tokenizer_fast, image_processor=image_processor) + processor_fast = ChineseCLIPProcessor(tokenizer=tokenizer_fast, image_processor=image_processor_fast) processor_fast.save_pretrained(tmpdir) processor_fast = ChineseCLIPProcessor.from_pretrained(self.tmpdirname) @@ -120,9 +129,9 @@ def test_save_load_pretrained_default(self): self.assertIsInstance(processor_fast.tokenizer, BertTokenizerFast) self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string()) + self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor_fast.to_json_string()) self.assertIsInstance(processor_slow.image_processor, ChineseCLIPImageProcessor) - self.assertIsInstance(processor_fast.image_processor, ChineseCLIPImageProcessor) + 
self.assertIsInstance(processor_fast.image_processor, ChineseCLIPImageProcessorFast) def test_save_load_pretrained_additional_features(self): with tempfile.TemporaryDirectory() as tmpdir: @@ -132,7 +141,7 @@ def test_save_load_pretrained_additional_features(self): processor.save_pretrained(tmpdir) tokenizer_add_kwargs = self.get_tokenizer(cls_token="(CLS)", sep_token="(SEP)") - image_processor_add_kwargs = self.get_image_processor(do_normalize=False) + image_processor_add_kwargs = self.get_image_processor_fast(do_normalize=False) processor = ChineseCLIPProcessor.from_pretrained( tmpdir, cls_token="(CLS)", sep_token="(SEP)", do_normalize=False @@ -142,7 +151,7 @@ def test_save_load_pretrained_additional_features(self): self.assertIsInstance(processor.tokenizer, BertTokenizerFast) self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, ChineseCLIPImageProcessor) + self.assertIsInstance(processor.image_processor, ChineseCLIPImageProcessorFast) def test_image_processor(self): image_processor = self.get_image_processor() @@ -152,8 +161,8 @@ def test_image_processor(self): image_input = self.prepare_image_inputs() - input_feat_extract = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") + input_feat_extract = image_processor(image_input, return_tensors="pt") + input_processor = processor(images=image_input, return_tensors="pt") for key in input_feat_extract: self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) diff --git a/tests/models/clip/test_processing_clip.py b/tests/models/clip/test_processing_clip.py index 6ca9a47b29c7..a9d38bf6703f 100644 --- a/tests/models/clip/test_processing_clip.py +++ b/tests/models/clip/test_processing_clip.py @@ -18,9 +18,9 @@ import pytest -from transformers import AutoTokenizer, CLIPTokenizer, CLIPTokenizerFast -from transformers.testing_utils import require_vision -from transformers.utils import is_vision_available +from transformers import AutoImageProcessor, AutoTokenizer, CLIPTokenizer, CLIPTokenizerFast +from transformers.testing_utils import require_torchvision, require_vision +from transformers.utils import is_torchvision_available, is_vision_available from ...test_processing_common import ProcessorTesterMixin @@ -28,11 +28,15 @@ if is_vision_available(): from transformers import CLIPImageProcessor, CLIPProcessor +if is_torchvision_available(): + from transformers import CLIPImageProcessorFast + TEST_MODEL_PATH = "openai/clip-vit-base-patch32" @require_vision +@require_torchvision class CLIPProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = CLIPProcessor @@ -40,7 +44,7 @@ class CLIPProcessorTest(ProcessorTesterMixin, unittest.TestCase): def setUpClass(cls): cls.tmpdirname = tempfile.mkdtemp() tokenizer = AutoTokenizer.from_pretrained(TEST_MODEL_PATH) - image_processor = CLIPImageProcessor.from_pretrained(TEST_MODEL_PATH) + image_processor = AutoImageProcessor.from_pretrained(TEST_MODEL_PATH) processor = CLIPProcessor( image_processor=image_processor, tokenizer=tokenizer, @@ -59,6 +63,10 @@ def get_rust_tokenizer(cls, **kwargs): def get_image_processor(cls, **kwargs): return CLIPImageProcessor.from_pretrained(cls.tmpdirname, **kwargs) + @classmethod + def get_image_processor_fast(cls, **kwargs): + return CLIPImageProcessorFast.from_pretrained(cls.tmpdirname, **kwargs) + @classmethod def tearDownClass(cls): 
shutil.rmtree(cls.tmpdirname) @@ -67,6 +75,7 @@ def test_save_load_pretrained_default(self): tokenizer_slow = self.get_tokenizer() tokenizer_fast = self.get_rust_tokenizer() image_processor = self.get_image_processor() + image_processor_fast = self.get_image_processor_fast() with tempfile.TemporaryDirectory() as tmpdir: processor_slow = CLIPProcessor(tokenizer=tokenizer_slow, image_processor=image_processor) @@ -84,17 +93,17 @@ def test_save_load_pretrained_default(self): self.assertIsInstance(processor_fast.tokenizer, CLIPTokenizerFast) self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string()) + self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor_fast.to_json_string()) self.assertIsInstance(processor_slow.image_processor, CLIPImageProcessor) - self.assertIsInstance(processor_fast.image_processor, CLIPImageProcessor) + self.assertIsInstance(processor_fast.image_processor, CLIPImageProcessorFast) def test_save_load_pretrained_additional_features(self): with tempfile.TemporaryDirectory() as tmpdir: - processor = CLIPProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) + processor = CLIPProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor_fast()) processor.save_pretrained(tmpdir) tokenizer_add_kwargs = CLIPTokenizer.from_pretrained(tmpdir, bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = CLIPImageProcessor.from_pretrained( + image_processor_add_kwargs = CLIPImageProcessorFast.from_pretrained( tmpdir, do_normalize=False, padding_value=1.0 ) @@ -106,7 +115,7 @@ def test_save_load_pretrained_additional_features(self): self.assertIsInstance(processor.tokenizer, CLIPTokenizerFast) self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, CLIPImageProcessor) + self.assertIsInstance(processor.image_processor, CLIPImageProcessorFast) def test_image_processor(self): image_processor = self.get_image_processor() diff --git a/tests/models/clipseg/test_processing_clipseg.py b/tests/models/clipseg/test_processing_clipseg.py index f7255838caa8..2f6a492cd408 100644 --- a/tests/models/clipseg/test_processing_clipseg.py +++ b/tests/models/clipseg/test_processing_clipseg.py @@ -22,8 +22,8 @@ from transformers import CLIPTokenizer, CLIPTokenizerFast from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES -from transformers.testing_utils import require_vision -from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available +from transformers.testing_utils import require_torchvision, require_vision +from transformers.utils import IMAGE_PROCESSOR_NAME, is_torchvision_available, is_vision_available from ...test_processing_common import ProcessorTesterMixin @@ -31,8 +31,12 @@ if is_vision_available(): from transformers import CLIPSegProcessor, ViTImageProcessor +if is_torchvision_available(): + from transformers import ViTImageProcessorFast + @require_vision +@require_torchvision class CLIPSegProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = CLIPSegProcessor @@ -73,6 +77,9 @@ def get_rust_tokenizer(self, **kwargs): def get_image_processor(self, **kwargs): return ViTImageProcessor.from_pretrained(self.tmpdirname, **kwargs) + def get_image_processor_fast(self, **kwargs): + return 
ViTImageProcessorFast.from_pretrained(self.tmpdirname, **kwargs) + def tearDown(self): shutil.rmtree(self.tmpdirname) @@ -80,12 +87,13 @@ def test_save_load_pretrained_default(self): tokenizer_slow = self.get_tokenizer() tokenizer_fast = self.get_rust_tokenizer() image_processor = self.get_image_processor() + image_processor_fast = self.get_image_processor_fast() processor_slow = CLIPSegProcessor(tokenizer=tokenizer_slow, image_processor=image_processor) processor_slow.save_pretrained(self.tmpdirname) processor_slow = CLIPSegProcessor.from_pretrained(self.tmpdirname, use_fast=False) - processor_fast = CLIPSegProcessor(tokenizer=tokenizer_fast, image_processor=image_processor) + processor_fast = CLIPSegProcessor(tokenizer=tokenizer_fast, image_processor=image_processor_fast) processor_fast.save_pretrained(self.tmpdirname) processor_fast = CLIPSegProcessor.from_pretrained(self.tmpdirname) @@ -96,16 +104,16 @@ def test_save_load_pretrained_default(self): self.assertIsInstance(processor_fast.tokenizer, CLIPTokenizerFast) self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string()) + self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor_fast.to_json_string()) self.assertIsInstance(processor_slow.image_processor, ViTImageProcessor) - self.assertIsInstance(processor_fast.image_processor, ViTImageProcessor) + self.assertIsInstance(processor_fast.image_processor, ViTImageProcessorFast) def test_save_load_pretrained_additional_features(self): - processor = CLIPSegProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) + processor = CLIPSegProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor_fast()) processor.save_pretrained(self.tmpdirname) tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) + image_processor_add_kwargs = self.get_image_processor_fast(do_normalize=False, padding_value=1.0) processor = CLIPSegProcessor.from_pretrained( self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 @@ -115,7 +123,7 @@ def test_save_load_pretrained_additional_features(self): self.assertIsInstance(processor.tokenizer, CLIPTokenizerFast) self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, ViTImageProcessor) + self.assertIsInstance(processor.image_processor, ViTImageProcessorFast) def test_image_processor(self): image_processor = self.get_image_processor() @@ -125,8 +133,8 @@ def test_image_processor(self): image_input = self.prepare_image_inputs() - input_feat_extract = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") + input_feat_extract = image_processor(image_input, return_tensors="pt") + input_processor = processor(images=image_input, return_tensors="pt") for key in input_feat_extract: self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) diff --git a/tests/models/flava/test_processing_flava.py b/tests/models/flava/test_processing_flava.py index 10a00a869915..7500ac13707e 100644 --- a/tests/models/flava/test_processing_flava.py +++ b/tests/models/flava/test_processing_flava.py @@ -23,8 +23,8 @@ from transformers 
import BertTokenizer, BertTokenizerFast from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES -from transformers.testing_utils import require_vision -from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available +from transformers.testing_utils import require_torchvision, require_vision +from transformers.utils import IMAGE_PROCESSOR_NAME, is_torchvision_available, is_vision_available from ...test_processing_common import ProcessorTesterMixin @@ -38,8 +38,12 @@ FLAVA_IMAGE_STD, ) +if is_torchvision_available(): + from transformers import FlavaImageProcessorFast + @require_vision +@require_torchvision class FlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = FlavaProcessor @@ -89,6 +93,9 @@ def get_rust_tokenizer(self, **kwargs): def get_image_processor(self, **kwargs): return FlavaImageProcessor.from_pretrained(self.tmpdirname, **kwargs) + def get_image_processor_fast(self, **kwargs): + return FlavaImageProcessorFast.from_pretrained(self.tmpdirname, **kwargs) + def tearDown(self): shutil.rmtree(self.tmpdirname) @@ -96,12 +103,13 @@ def test_save_load_pretrained_default(self): tokenizer_slow = self.get_tokenizer() tokenizer_fast = self.get_rust_tokenizer() image_processor = self.get_image_processor() + image_processor_fast = self.get_image_processor_fast() processor_slow = FlavaProcessor(tokenizer=tokenizer_slow, image_processor=image_processor) processor_slow.save_pretrained(self.tmpdirname) processor_slow = FlavaProcessor.from_pretrained(self.tmpdirname, use_fast=False) - processor_fast = FlavaProcessor(tokenizer=tokenizer_fast, image_processor=image_processor) + processor_fast = FlavaProcessor(tokenizer=tokenizer_fast, image_processor=image_processor_fast) processor_fast.save_pretrained(self.tmpdirname) processor_fast = FlavaProcessor.from_pretrained(self.tmpdirname) @@ -112,16 +120,16 @@ def test_save_load_pretrained_default(self): self.assertIsInstance(processor_fast.tokenizer, BertTokenizerFast) self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string()) + self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor_fast.to_json_string()) self.assertIsInstance(processor_slow.image_processor, FlavaImageProcessor) - self.assertIsInstance(processor_fast.image_processor, FlavaImageProcessor) + self.assertIsInstance(processor_fast.image_processor, FlavaImageProcessorFast) def test_save_load_pretrained_additional_features(self): processor = FlavaProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) processor.save_pretrained(self.tmpdirname) tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) + image_processor_add_kwargs = self.get_image_processor_fast(do_normalize=False, padding_value=1.0) processor = FlavaProcessor.from_pretrained( self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 @@ -131,7 +139,7 @@ def test_save_load_pretrained_additional_features(self): self.assertIsInstance(processor.tokenizer, BertTokenizerFast) self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, FlavaImageProcessor) + self.assertIsInstance(processor.image_processor, FlavaImageProcessorFast) def 
test_image_processor(self): image_processor = self.get_image_processor() @@ -141,8 +149,8 @@ def test_image_processor(self): image_input = self.prepare_image_inputs() - input_feat_extract = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") + input_feat_extract = image_processor(image_input, return_tensors="pt") + input_processor = processor(images=image_input, return_tensors="pt") for key in input_feat_extract: self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) @@ -150,11 +158,11 @@ def test_image_processor(self): # With rest of the args random.seed(1234) input_feat_extract = image_processor( - image_input, return_image_mask=True, return_codebook_pixels=True, return_tensors="np" + image_input, return_image_mask=True, return_codebook_pixels=True, return_tensors="pt" ) random.seed(1234) input_processor = processor( - images=image_input, return_image_mask=True, return_codebook_pixels=True, return_tensors="np" + images=image_input, return_image_mask=True, return_codebook_pixels=True, return_tensors="pt" ) for key in input_feat_extract: diff --git a/tests/models/git/test_processing_git.py b/tests/models/git/test_processing_git.py index 5e06636007bc..a1842ffaaa81 100644 --- a/tests/models/git/test_processing_git.py +++ b/tests/models/git/test_processing_git.py @@ -17,8 +17,8 @@ import pytest -from transformers.testing_utils import require_vision -from transformers.utils import is_vision_available +from transformers.testing_utils import require_torchvision, require_vision +from transformers.utils import is_torchvision_available, is_vision_available from ...test_processing_common import ProcessorTesterMixin @@ -26,8 +26,12 @@ if is_vision_available(): from transformers import AutoProcessor, BertTokenizer, CLIPImageProcessor, GitProcessor, PreTrainedTokenizerFast +if is_torchvision_available(): + from transformers import CLIPImageProcessorFast + @require_vision +@require_torchvision class GitProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = GitProcessor @@ -50,6 +54,9 @@ def get_tokenizer(self, **kwargs): def get_image_processor(self, **kwargs): return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor + def get_image_processor_fast(self, **kwargs): + return CLIPImageProcessorFast.from_pretrained(self.tmpdirname, **kwargs) + @classmethod def tearDownClass(cls): shutil.rmtree(cls.tmpdirname, ignore_errors=True) @@ -60,7 +67,7 @@ def test_save_load_pretrained_additional_features(self): processor.save_pretrained(tmpdir) tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) + image_processor_add_kwargs = self.get_image_processor_fast(do_normalize=False, padding_value=1.0) processor = GitProcessor.from_pretrained( tmpdir, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 @@ -70,7 +77,7 @@ def test_save_load_pretrained_additional_features(self): self.assertIsInstance(processor.tokenizer, PreTrainedTokenizerFast) self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, CLIPImageProcessor) + self.assertIsInstance(processor.image_processor, CLIPImageProcessorFast) def test_image_processor(self): image_processor = self.get_image_processor() @@ -80,8 +87,8 @@ def test_image_processor(self): image_input = 
self.prepare_image_inputs() - input_feat_extract = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") + input_feat_extract = image_processor(image_input, return_tensors="pt") + input_processor = processor(images=image_input, return_tensors="pt") for key in input_feat_extract: self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) diff --git a/tests/models/glm4v/test_processor_glm4v.py b/tests/models/glm4v/test_processing_glm4v.py similarity index 100% rename from tests/models/glm4v/test_processor_glm4v.py rename to tests/models/glm4v/test_processing_glm4v.py diff --git a/tests/models/grounding_dino/test_processing_grounding_dino.py b/tests/models/grounding_dino/test_processing_grounding_dino.py index 088f240eee73..46cac4c97893 100644 --- a/tests/models/grounding_dino/test_processing_grounding_dino.py +++ b/tests/models/grounding_dino/test_processing_grounding_dino.py @@ -23,8 +23,8 @@ from transformers import BertTokenizer, BertTokenizerFast, GroundingDinoProcessor from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES -from transformers.testing_utils import require_torch, require_vision -from transformers.utils import IMAGE_PROCESSOR_NAME, is_torch_available, is_vision_available +from transformers.testing_utils import require_torch, require_torchvision, require_vision +from transformers.utils import IMAGE_PROCESSOR_NAME, is_torch_available, is_torchvision_available, is_vision_available from ...test_processing_common import ProcessorTesterMixin @@ -38,8 +38,13 @@ from transformers import GroundingDinoImageProcessor +if is_torchvision_available(): + from transformers import GroundingDinoImageProcessorFast + + @require_torch @require_vision +@require_torchvision class GroundingDinoProcessorTest(ProcessorTesterMixin, unittest.TestCase): from_pretrained_id = "IDEA-Research/grounding-dino-base" processor_class = GroundingDinoProcessor @@ -108,6 +113,10 @@ def get_rust_tokenizer(cls, **kwargs): def get_image_processor(cls, **kwargs): return GroundingDinoImageProcessor.from_pretrained(cls.tmpdirname, **kwargs) + # Copied from tests.models.clip.test_processing_clip.CLIPProcessorTest.get_image_processor_fast with CLIP->GroundingDino + def get_image_processor_fast(cls, **kwargs): + return GroundingDinoImageProcessorFast.from_pretrained(cls.tmpdirname, **kwargs) + @classmethod def tearDownClass(cls): shutil.rmtree(cls.tmpdirname, ignore_errors=True) @@ -150,6 +159,7 @@ def test_save_load_pretrained_default(self): tokenizer_slow = self.get_tokenizer() tokenizer_fast = self.get_rust_tokenizer() image_processor = self.get_image_processor() + image_processor_fast = self.get_image_processor_fast() with tempfile.TemporaryDirectory() as tmpdir: processor_slow = GroundingDinoProcessor(tokenizer=tokenizer_slow, image_processor=image_processor) @@ -167,20 +177,20 @@ def test_save_load_pretrained_default(self): self.assertIsInstance(processor_fast.tokenizer, BertTokenizerFast) self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string()) + self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor_fast.to_json_string()) self.assertIsInstance(processor_slow.image_processor, GroundingDinoImageProcessor) - self.assertIsInstance(processor_fast.image_processor, GroundingDinoImageProcessor) + 
self.assertIsInstance(processor_fast.image_processor, GroundingDinoImageProcessorFast) # Copied from tests.models.clip.test_processing_clip.CLIPProcessorTest.test_save_load_pretrained_additional_features with CLIP->GroundingDino,GroundingDinoTokenizer->BertTokenizer def test_save_load_pretrained_additional_features(self): with tempfile.TemporaryDirectory() as tmpdir: processor = GroundingDinoProcessor( - tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor() + tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor_fast() ) processor.save_pretrained(tmpdir) tokenizer_add_kwargs = BertTokenizer.from_pretrained(tmpdir, bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = GroundingDinoImageProcessor.from_pretrained( + image_processor_add_kwargs = GroundingDinoImageProcessorFast.from_pretrained( tmpdir, do_normalize=False, padding_value=1.0 ) @@ -192,7 +202,7 @@ def test_save_load_pretrained_additional_features(self): self.assertIsInstance(processor.tokenizer, BertTokenizerFast) self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, GroundingDinoImageProcessor) + self.assertIsInstance(processor.image_processor, GroundingDinoImageProcessorFast) # Copied from tests.models.clip.test_processing_clip.CLIPProcessorTest.test_image_processor with CLIP->GroundingDino def test_image_processor(self): diff --git a/tests/models/instructblip/test_processing_instructblip.py b/tests/models/instructblip/test_processing_instructblip.py index 019fe85f72e1..0ec587b48bad 100644 --- a/tests/models/instructblip/test_processing_instructblip.py +++ b/tests/models/instructblip/test_processing_instructblip.py @@ -17,8 +17,8 @@ import pytest -from transformers.testing_utils import require_vision -from transformers.utils import is_vision_available +from transformers.testing_utils import require_torchvision, require_vision +from transformers.utils import is_torchvision_available, is_vision_available from ...test_processing_common import ProcessorTesterMixin @@ -27,14 +27,17 @@ from transformers import ( AutoProcessor, BertTokenizerFast, - BlipImageProcessor, GPT2Tokenizer, InstructBlipProcessor, PreTrainedTokenizerFast, ) +if is_torchvision_available(): + from transformers import BlipImageProcessorFast + @require_vision +@require_torchvision class InstructBlipProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = InstructBlipProcessor @@ -42,7 +45,7 @@ class InstructBlipProcessorTest(ProcessorTesterMixin, unittest.TestCase): def setUpClass(cls): cls.tmpdirname = tempfile.mkdtemp() - image_processor = BlipImageProcessor() + image_processor = BlipImageProcessorFast() tokenizer = GPT2Tokenizer.from_pretrained("hf-internal-testing/tiny-random-GPT2Model") qformer_tokenizer = BertTokenizerFast.from_pretrained("hf-internal-testing/tiny-random-bert") @@ -86,7 +89,7 @@ def test_save_load_pretrained_additional_features(self): self.assertIsInstance(processor.tokenizer, PreTrainedTokenizerFast) self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, BlipImageProcessor) + self.assertIsInstance(processor.image_processor, BlipImageProcessorFast) self.assertIsInstance(processor.qformer_tokenizer, BertTokenizerFast) def test_image_processor(self): @@ -104,8 +107,8 @@ def test_image_processor(self): image_input = self.prepare_image_inputs() - input_feat_extract = 
image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") + input_feat_extract = image_processor(image_input, return_tensors="pt") + input_processor = processor(images=image_input, return_tensors="pt") for key in input_feat_extract: self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) diff --git a/tests/models/kosmos2/test_processing_kosmos2.py b/tests/models/kosmos2/test_processing_kosmos2.py index c2c98882ef02..a21c139619fe 100644 --- a/tests/models/kosmos2/test_processing_kosmos2.py +++ b/tests/models/kosmos2/test_processing_kosmos2.py @@ -21,6 +21,13 @@ import numpy as np import pytest +from transformers import ( + AutoProcessor, + Kosmos2Processor, + PreTrainedTokenizerFast, + XLMRobertaTokenizer, + XLMRobertaTokenizerFast, +) from transformers.image_utils import load_image from transformers.models.auto.processing_auto import processor_class_from_name from transformers.testing_utils import ( @@ -28,9 +35,10 @@ require_sentencepiece, require_tokenizers, require_torch, + require_torchvision, require_vision, ) -from transformers.utils import is_vision_available +from transformers.utils import is_torchvision_available, is_vision_available from ...test_processing_common import ProcessorTesterMixin, url_to_local_path @@ -38,15 +46,9 @@ if is_vision_available(): from PIL import Image - from transformers import ( - AutoProcessor, - CLIPImageProcessor, - Kosmos2Processor, - PreTrainedTokenizerFast, - XLMRobertaTokenizer, - XLMRobertaTokenizerFast, - ) +if is_torchvision_available(): + from transformers import CLIPImageProcessorFast SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") @@ -54,6 +56,7 @@ @require_sentencepiece @require_tokenizers @require_vision +@require_torchvision class Kosmos2ProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = Kosmos2Processor @@ -61,7 +64,7 @@ class Kosmos2ProcessorTest(ProcessorTesterMixin, unittest.TestCase): def setUpClass(cls): cls.tmpdirname = tempfile.mkdtemp() - image_processor = CLIPImageProcessor(do_center_crop=False) + image_processor = CLIPImageProcessorFast(do_center_crop=False) # We have a SentencePiece fixture for testing slow_tokenizer = XLMRobertaTokenizer(SAMPLE_VOCAB) @@ -99,10 +102,10 @@ def tearDownClass(cls): def test_image_processor_load_save_reload(self): # make sure load from Hub repo. 
-> save -> reload locally work - image_processor = CLIPImageProcessor.from_pretrained("microsoft/kosmos-2-patch14-224") + image_processor = CLIPImageProcessorFast.from_pretrained("microsoft/kosmos-2-patch14-224") with TemporaryDirectory() as tmp_dir: image_processor.save_pretrained(tmp_dir) - reloaded_image_processor = CLIPImageProcessor.from_pretrained(tmp_dir) + reloaded_image_processor = CLIPImageProcessorFast.from_pretrained(tmp_dir) assert image_processor.to_dict() == reloaded_image_processor.to_dict() assert image_processor.to_json_string() == reloaded_image_processor.to_json_string() @@ -122,7 +125,7 @@ def test_save_load_pretrained_additional_features(self): self.assertIsInstance(processor.tokenizer, PreTrainedTokenizerFast) self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, CLIPImageProcessor) + self.assertIsInstance(processor.image_processor, CLIPImageProcessorFast) def test_image_processor(self): image_processor = self.get_image_processor() @@ -132,8 +135,8 @@ def test_image_processor(self): image_input = self.prepare_image_inputs() - input_image_processor = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") + input_image_processor = image_processor(image_input, return_tensors="pt") + input_processor = processor(images=image_input, return_tensors="pt") for key in input_image_processor: self.assertAlmostEqual(input_image_processor[key].sum(), input_processor[key].sum(), delta=1e-2) @@ -189,7 +192,8 @@ def test_tokenizer_decode(self): def test_full_processor(self): url = url_to_local_path("https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/two_dogs.jpg") - processor = Kosmos2Processor.from_pretrained("microsoft/kosmos-2-patch14-224") + # BC with use_square_size + processor = Kosmos2Processor.from_pretrained("microsoft/kosmos-2-patch14-224", size=(224, 224)) # test with different input formats. 
# fmt: off @@ -395,8 +399,8 @@ def check(texts, bboxes, expected_input_ids): outputs.image_embeds_position_mask, [0] * 2 + [1] * num_image_tokens + [0] + [0] * (len(expected_input_ids[0]) - 1), ) - np.testing.assert_allclose(outputs.pixel_values[0][:3, :3, :3], EXPECTED_PIXEL_VALUES_1, atol=1e-9) - np.testing.assert_allclose(outputs.pixel_values[0][:3, -3:, -3:], EXPECTED_PIXEL_VALUES_2, atol=1e-9) + np.testing.assert_allclose(outputs.pixel_values[0][:3, :3, :3].numpy(), EXPECTED_PIXEL_VALUES_1, atol=1e-4) + np.testing.assert_allclose(outputs.pixel_values[0][:3, -3:, -3:].numpy(), EXPECTED_PIXEL_VALUES_2, atol=1e-4) # test with image in batch (right padding) outputs = processor( @@ -409,10 +413,10 @@ def check(texts, bboxes, expected_input_ids): ) self.assertTupleEqual(outputs.pixel_values.shape, (4, 3, 224, 224)) np.testing.assert_allclose( - outputs.pixel_values[:, :3, :3, :3].numpy(), [EXPECTED_PIXEL_VALUES_1] * len(batch_image), atol=1e-9 + outputs.pixel_values[:, :3, :3, :3].numpy(), [EXPECTED_PIXEL_VALUES_1] * len(batch_image), atol=1e-4 ) np.testing.assert_allclose( - outputs.pixel_values[:, :3, -3:, -3:].numpy(), [EXPECTED_PIXEL_VALUES_2] * len(batch_image), atol=1e-9 + outputs.pixel_values[:, :3, -3:, -3:].numpy(), [EXPECTED_PIXEL_VALUES_2] * len(batch_image), atol=1e-4 ) # padding on the right: the `[1:]` below is because the part for `BOS` is already added in the beginning of each (dynamically computed) expected value # noqa # fmt: off diff --git a/tests/models/kosmos2_5/test_processor_kosmos2_5.py b/tests/models/kosmos2_5/test_processing_kosmos2_5.py similarity index 96% rename from tests/models/kosmos2_5/test_processor_kosmos2_5.py rename to tests/models/kosmos2_5/test_processing_kosmos2_5.py index 1bc41307712c..10b2b61e45bd 100644 --- a/tests/models/kosmos2_5/test_processor_kosmos2_5.py +++ b/tests/models/kosmos2_5/test_processing_kosmos2_5.py @@ -25,9 +25,10 @@ from transformers.image_utils import load_image from transformers.testing_utils import ( require_torch, + require_torchvision, require_vision, ) -from transformers.utils import is_vision_available +from transformers.utils import is_torchvision_available, is_vision_available from ...test_processing_common import ProcessorTesterMixin, url_to_local_path @@ -38,20 +39,23 @@ from transformers import ( AutoProcessor, AutoTokenizer, - Kosmos2_5ImageProcessor, Kosmos2_5Processor, PreTrainedTokenizerFast, ) +if is_torchvision_available(): + from transformers import Kosmos2_5ImageProcessorFast + @require_vision +@require_torchvision class Kosmos2_5ProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = Kosmos2_5Processor images_input_name = "flattened_patches" def setUp(self): self.tmpdirname = tempfile.mkdtemp() - image_processor = Kosmos2_5ImageProcessor() + image_processor = Kosmos2_5ImageProcessorFast() tokenizer = AutoTokenizer.from_pretrained("microsoft/kosmos-2.5") processor = Kosmos2_5Processor(image_processor, tokenizer) processor.save_pretrained(self.tmpdirname) @@ -67,10 +71,10 @@ def tearDown(self): def test_image_procesor_load_save_reload(self): # make sure load from Hub repo. 
-> save -> reload locally work - image_processor = Kosmos2_5ImageProcessor.from_pretrained("microsoft/kosmos-2.5") + image_processor = Kosmos2_5ImageProcessorFast.from_pretrained("microsoft/kosmos-2.5") with TemporaryDirectory() as tmp_dir: image_processor.save_pretrained(tmp_dir) - reloaded_image_processor = Kosmos2_5ImageProcessor.from_pretrained(tmp_dir) + reloaded_image_processor = Kosmos2_5ImageProcessorFast.from_pretrained(tmp_dir) assert image_processor.to_dict() == reloaded_image_processor.to_dict() assert image_processor.to_json_string() == reloaded_image_processor.to_json_string() @@ -96,7 +100,7 @@ def test_save_load_pretrained_additional_features(self): processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string(), ) - self.assertIsInstance(processor.image_processor, Kosmos2_5ImageProcessor) + self.assertIsInstance(processor.image_processor, Kosmos2_5ImageProcessorFast) @unittest.skip(reason="kosmos-2.5 must have both image and text") def test_image_processor(self): @@ -356,12 +360,12 @@ def test_full_processor(self): np.testing.assert_allclose( outputs.flattened_patches[0][1][:10].numpy().tolist(), EXPECTED_FP_1, - atol=1e-9, + atol=1e-4, ) np.testing.assert_allclose( outputs.flattened_patches[0][200][:10].numpy().tolist(), EXPECTED_FP_200, - atol=1e-9, + atol=1e-4, ) # test a batch of images and texts, right padding diff --git a/tests/models/ovis2/test_processor_ovis2.py b/tests/models/ovis2/test_processing_ovis2.py similarity index 100% rename from tests/models/ovis2/test_processor_ovis2.py rename to tests/models/ovis2/test_processing_ovis2.py diff --git a/tests/models/sam2/test_processor_sam2.py b/tests/models/sam2/test_processing_sam2.py similarity index 100% rename from tests/models/sam2/test_processor_sam2.py rename to tests/models/sam2/test_processing_sam2.py diff --git a/tests/models/sam2_video/test_processor_sam2_video.py b/tests/models/sam2_video/test_processing_sam2_video.py similarity index 100% rename from tests/models/sam2_video/test_processor_sam2_video.py rename to tests/models/sam2_video/test_processing_sam2_video.py diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index 295ee03a769e..ef7b7e8a157b 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -26,7 +26,11 @@ from parameterized import parameterized from transformers.models.auto.processing_auto import processor_class_from_name -from transformers.processing_utils import Unpack +from transformers.processing_utils import ( + MODALITY_TO_AUTOPROCESSOR_MAPPING, + SPECIAL_MODULE_TO_MODEL_NAME_MAPPING, + Unpack, +) from transformers.testing_utils import ( check_json_file_has_correct_format, require_av, @@ -64,7 +68,6 @@ ], } - for modality, urls in MODALITY_INPUT_DATA.items(): MODALITY_INPUT_DATA[modality] = [url_to_local_path(url) for url in urls] @@ -106,13 +109,30 @@ def prepare_processor_dict(): def get_component(self, attribute, **kwargs): assert attribute in self.processor_class.attributes - component_class_name = getattr(self.processor_class, f"{attribute}_class") + # determine from current file name + if attribute not in MODALITY_TO_AUTOPROCESSOR_MAPPING and "tokenizer" in attribute: + attribute = "tokenizer" + model_name_lowercase = self.__class__.__module__.split(".")[-1].replace("test_processing_", "").split(".")[0] + component_class_name = MODALITY_TO_AUTOPROCESSOR_MAPPING[attribute].get(model_name_lowercase, None) + if component_class_name is None: + component_class_name = 
MODALITY_TO_AUTOPROCESSOR_MAPPING[attribute].get( + model_name_lowercase.replace("_", "-"), None + ) + if component_class_name is None: + component_class_name = MODALITY_TO_AUTOPROCESSOR_MAPPING[attribute].get( + SPECIAL_MODULE_TO_MODEL_NAME_MAPPING.get(model_name_lowercase, None), None + ) + if component_class_name is None: + raise ValueError(f"Could not find component class name for {attribute} and {model_name_lowercase}") if isinstance(component_class_name, tuple): if attribute == "image_processor": - # TODO: @yoni, change logic in v4.52 (when use_fast set to True by default) - component_class_name = component_class_name[0] + component_class_name = ( + component_class_name[-1] if component_class_name[-1] else component_class_name[0] + ) else: - component_class_name = component_class_name[-1] + component_class_name = ( + component_class_name[-1] if isinstance(component_class_name, tuple) else component_class_name + ) component_class = processor_class_from_name(component_class_name) component = component_class.from_pretrained(self.tmpdirname, **kwargs) # noqa
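
Note on the lookup chain introduced above: the new `get_component` resolves the processor class name from the test module name with three fallbacks (exact lowercase model name, underscores replaced by hyphens, then a special-module alias). The sketch below is illustrative only. The two dictionaries are toy stand-ins with invented entries (the real `MODALITY_TO_AUTOPROCESSOR_MAPPING` and `SPECIAL_MODULE_TO_MODEL_NAME_MAPPING` live in `transformers.processing_utils`), and `resolve_component_class_name` is a hypothetical helper written only to show the control flow, not part of the library or of this patch.

```python
# Illustrative sketch of the class-name resolution mirrored from `get_component` above.
# Both mappings here are toy stand-ins; their contents are made up for the example.

MODALITY_TO_AUTOPROCESSOR_MAPPING = {
    "image_processor": {
        "kosmos2": "CLIPImageProcessorFast",          # hypothetical entry
        "kosmos-2_5": "Kosmos2_5ImageProcessorFast",  # hypothetical entry
    },
    "tokenizer": {
        # tuples list (slow, fast) variants
        "kosmos2": ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast"),  # hypothetical entry
    },
}
SPECIAL_MODULE_TO_MODEL_NAME_MAPPING = {
    "kosmos2_5": "kosmos-2_5",  # hypothetical alias: test module name -> model type
}


def resolve_component_class_name(attribute: str, test_module: str) -> str:
    """Mirror the fallback chain: exact name, underscores->hyphens, then special alias."""
    # e.g. "tests.models.kosmos2.test_processing_kosmos2" -> "kosmos2"
    model_name = test_module.split(".")[-1].replace("test_processing_", "")
    mapping = MODALITY_TO_AUTOPROCESSOR_MAPPING[attribute]

    class_name = mapping.get(model_name)
    if class_name is None:
        class_name = mapping.get(model_name.replace("_", "-"))
    if class_name is None:
        class_name = mapping.get(SPECIAL_MODULE_TO_MODEL_NAME_MAPPING.get(model_name))
    if class_name is None:
        raise ValueError(f"Could not find component class name for {attribute} and {model_name}")

    # When both slow and fast variants are registered, prefer the fast one if it exists.
    if isinstance(class_name, tuple):
        class_name = class_name[-1] if class_name[-1] else class_name[0]
    return class_name


if __name__ == "__main__":
    # Resolves via the exact-name lookup.
    print(resolve_component_class_name("image_processor", "tests.models.kosmos2.test_processing_kosmos2"))
    # Resolves via the special-module alias fallback.
    print(resolve_component_class_name("image_processor", "tests.models.kosmos2_5.test_processing_kosmos2_5"))
```

The design choice this mirrors: tests no longer read `<attribute>_class` from the processor class but derive the component from the auto mappings, which is what lets every test default to the fast (torchvision-backed) image processor when one is registered.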