From a94c537704edd778441da66dab5778fb1858544b Mon Sep 17 00:00:00 2001
From: Amy Roberts
Date: Wed, 27 Jul 2022 10:37:54 +0100
Subject: [PATCH 001/162] Adapt FE methods to transforms library

---
 src/transformers/__init__.py         |   2 +
 src/transformers/image_transforms.py | 130 ++++++++++++++++++
 src/transformers/image_utils.py      |  71 +++++++++-
 .../utils/dummy_vision_objects.py    |   8 ++
 4 files changed, 208 insertions(+), 3 deletions(-)
 create mode 100644 src/transformers/image_transforms.py

diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 8f4e4840f9df40..dcacc328e8a8a3 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -632,6 +632,7 @@
         name for name in dir(dummy_vision_objects) if not name.startswith("_")
     ]
 else:
+    _import_structure["image_transforms"] = ["resize", "to_pil_image"]
     _import_structure["image_utils"] = ["ImageFeatureExtractionMixin"]
     _import_structure["models.beit"].append("BeitFeatureExtractor")
     _import_structure["models.clip"].append("CLIPFeatureExtractor")
@@ -3339,6 +3340,7 @@
     except OptionalDependencyNotAvailable:
         from .utils.dummy_vision_objects import *
     else:
+        from .image_transforms import resize, to_pil_image
        from .image_utils import ImageFeatureExtractionMixin
         from .models.beit import BeitFeatureExtractor
         from .models.clip import CLIPFeatureExtractor, CLIPProcessor
diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py
new file mode 100644
index 00000000000000..7585aff7643265
--- /dev/null
+++ b/src/transformers/image_transforms.py
@@ -0,0 +1,130 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Tuple, List, Union
+
+import PIL
+import numpy as np
+
+from .image_utils import (
+    ChannelDimension,
+    get_image_size,
+    infer_channel_dimension,
+    is_torch_tensor,
+    is_tf_tensor,
+    is_jax_tensor
+)
+
+
+def to_pil_image(
+    image: Union[np.ndarray, PIL.Image.Image, "torch.Tensor", "tf.Tensor"],
+    channel_dim: Optional[ChannelDimension] = None,
+    rescale=None
+) -> PIL.Image.Image:
+    """
+    Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if
+    needed.
+
+    Args:
+        image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor` or `tf.Tensor`):
+            The image to convert to the PIL Image format.
+        rescale (`bool`, *optional*):
+            Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will default
+            to `True` if the image type is a floating type, `False` otherwise.
+    """
+    if isinstance(image, PIL.Image.Image):
+        return image
+
+    if is_torch_tensor(image) or is_tf_tensor(image):
+        image = image.numpy()
+    elif is_jax_tensor(image):
+        image = np.array(image)
+
+    if not isinstance(image, np.ndarray):
+        raise ValueError("Input image must be of type PIL.Image.Image, numpy.ndarray or torch.Tensor")
+
+    # If the channel has been moved to the first dim, we put it back at the end.
+    channel_dim = infer_channel_dimension(image) if channel_dim is None else channel_dim
+    if channel_dim == ChannelDimension.FIRST:
+        image = image.transpose((1, 2, 0))
+
+    # PIL.Image can only store uint8 values, so we rescale the image to be between 0 and 255 if needed.
+    rescale = isinstance(image.flat[0], float) if rescale is None else rescale
+    if rescale:
+        rescale = image * 255
+    image = image.astype(np.uint8)
+    return PIL.Image.fromarray(image)
+
+
+def get_resize_output_image_size(
+    input_image: np.ndarray,
+    size: Union[int, Tuple[int, int], List[int]],
+    default_to_square: bool = True,
+    max_size: int = None
+) -> np.ndarray:
+    if isinstance(size, (tuple, list)):
+        if len(size) == 2:
+            return size
+        elif len(size) == 1:
+            # Perform same logic as if size was an int
+            size = size[0]
+        else:
+            raise ValueError("size must have 1 or 2 elements if it is a list or tuple")
+
+    if default_to_square:
+        return (size, size)
+
+    height, width = get_image_size(input_image)
+    short, long = (width, height) if width <= height else (height, width)
+    requested_new_short = size
+
+    if short == requested_new_short:
+        return (height, width)
+
+    new_short, new_long = requested_new_short, int(requested_new_short * long / short)
+
+    if max_size is not None:
+        if max_size <= requested_new_short:
+            raise ValueError(
+                f"max_size = {max_size} must be strictly greater than the requested "
+                f"size for the smaller edge size = {size}"
+            )
+        if new_long > max_size:
+            new_short, new_long = int(max_size * new_short / new_long), max_size
+
+    # Return in (height, width) order, matching get_image_size.
+    return (new_long, new_short) if width <= height else (new_short, new_long)
+
+
+def resize(image, size: Tuple[int, int], resample=PIL.Image.BILINEAR):
+    """
+    Resizes `image`. Enforces conversion of input to PIL.Image.
+
+    Args:
+        image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
+            The image to resize.
+        size (`Tuple[int, int]`):
+            The (height, width) size to use for resizing the image.
+        resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`):
+            The filter to use for resampling.
+
+    Returns:
+        image: A resized np.ndarray.
+    """
+    # To maintain backwards compatibility with the resizing done in previous image feature extractors, we use
+    # the pillow library to resize the image and then convert back to numpy
+    if not isinstance(image, PIL.Image.Image):
+        image = to_pil_image(image)
+    # PIL.Image.resize expects a (width, height) tuple, so the (height, width) size is flipped here.
+    height, width = size
+    resized_image = image.resize((width, height), resample=resample)
+    return np.array(resized_image)
diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py
index ddef7a3a777e93..e23321be478040 100644
--- a/src/transformers/image_utils.py
+++ b/src/transformers/image_utils.py
@@ -13,8 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
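Before the `image_utils.py` side of this commit continues below, a quick sketch of how the two helpers it exports are meant to compose (illustrative only, separate from the patch; a `uint8` input is used so the float rescaling path is not involved):

import numpy as np
from transformers import resize, to_pil_image

image = np.random.randint(0, 256, (3, 32, 32), dtype=np.uint8)  # channels-first fake image

# to_pil_image infers the channel dimension, moves it last, and builds a PIL.Image.
pil_image = to_pil_image(image)

# resize accepts numpy input directly: it converts to PIL internally and
# returns a numpy array, here of shape (64, 64, 3).
resized = resize(image, size=(64, 64))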
+import enum import os -from typing import List, Union +from typing import List, Tuple, Union import numpy as np import PIL.Image @@ -22,8 +23,8 @@ import requests -from .utils import is_torch_available -from .utils.generic import _is_torch +from .utils import is_torch_available, is_tf_available, is_jax_available +from .utils.generic import _is_torch, _is_tensorflow, _is_jax IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406] @@ -40,6 +41,70 @@ def is_torch_tensor(obj): return _is_torch(obj) if is_torch_available() else False +def is_tf_tensor(obj): + return _is_tensorflow(obj) if is_tf_available() else False + + +def is_jax_tensor(obj): + return _is_jax(obj) if is_jax_available() else False + + +class ChannelDimension(enum.Enum): + FIRST = 1 + LAST = 3 + + +def infer_channel_dimension(image: np.ndarray) -> ChannelDimension: + """ + Infers the channel dimension of the image. + + Args: + image (`np.ndarray`): + The image to infer the channel dimension of. + + Returns: + The channel dimension of the image. + """ + if image.ndim == 3: + first_dim = 0 + last_dim = 2 + elif image.ndim == 4: + first_dim = 1 + last_dim = 3 + else: + raise ValueError(f"Unsupported image dimension: {image.ndim}") + + if image.shape[first_dim] in (1, 3): + return ChannelDimension.FIRST + elif image.shape[last_dim] in (1, 3): + return ChannelDimension.LAST + raise Exception("Could not infer channel dimension") + + +def get_image_size(image: np.ndarray, channel_dim: ChannelDimension = None) -> Tuple[int, int]: + """ + Returns the (height, width) dimensions of the image. + + Args: + image (`np.ndarray`): + The image to get the dimensions of. + channel_dim (`ChannelDimension`, *optional*): + Which dimension the channel dimension is in. If `None`, will infer the channel dimension from the image. + + Returns: + A tuple of the image's height and width. + """ + if channel_dim is None: + channel_dim = infer_channel_dimension(image) + + if channel_dim == ChannelDimension.FIRST: + return image.shape[-2], image.shape[-1] + elif channel_dim == ChannelDimension.LAST: + return image.shape[-3], image.shape[-2] + else: + raise ValueError(f"Unsupported data format: {channel_dim}") + + def load_image(image: Union[str, "PIL.Image.Image"]) -> "PIL.Image.Image": """ Loads `image` to a PIL Image. diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index e5d2bced9e0415..0d9da1b7a16d42 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -3,6 +3,14 @@ from ..utils import DummyObject, requires_backends +def resize(*args, **kwargs): + requires_backends(resize, ["vision"]) + + +def to_pil_image(*args, **kwargs): + requires_backends(to_pil_image, ["vision"]) + + class ImageFeatureExtractionMixin(metaclass=DummyObject): _backends = ["vision"] From 932f291fea0419125a705e4efa7aea79fcb7d73b Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Wed, 27 Jul 2022 15:55:48 +0100 Subject: [PATCH 002/162] Mixin for saving the image processor --- src/transformers/image_processing_utils.py | 426 +++++++++++++++++++++ src/transformers/utils/__init__.py | 1 + 2 files changed, 427 insertions(+) create mode 100644 src/transformers/image_processing_utils.py diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py new file mode 100644 index 00000000000000..e053f4f486a788 --- /dev/null +++ b/src/transformers/image_processing_utils.py @@ -0,0 +1,426 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import os
+import json
+from typing import Any, Dict, Tuple, Union
+
+import numpy as np
+from requests import HTTPError
+
+from .dynamic_module_utils import custom_object_save
+from .utils import (
+    IMAGE_PROCESSOR_NAME,
+    PushToHubMixin,
+    logging,
+    HUGGINGFACE_CO_RESOLVE_ENDPOINT,
+    EntryNotFoundError,
+    RepositoryNotFoundError,
+    RevisionNotFoundError,
+    cached_path,
+    copy_func,
+    hf_bucket_url,
+    is_remote_url,
+    is_offline_mode
+)
+
+logger = logging.get_logger(__name__)
+
+
+class ImageProcessorMixin(PushToHubMixin):
+    """
+    Image processor mixin used to provide saving/loading functionality.
+    """
+
+    _auto_class = None
+
+    def __init__(self, **kwargs):
+        """Set elements of `kwargs` as attributes."""
+        # Pop "processor_class" as it should be saved as a private attribute
+        self._processor_class = kwargs.pop("processor_class", None)
+        # Additional attributes without default values
+        for key, value in kwargs.items():
+            try:
+                setattr(self, key, value)
+            except AttributeError as err:
+                logger.error(f"Can't set {key} with value {value} for {self}")
+                raise err
+
+    def _set_processor_class(self, processor_class: str):
+        """Sets processor class as an attribute."""
+        self._processor_class = processor_class
+
+    @classmethod
+    def from_pretrained(
+        cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
+    ):
+        r"""
+        Instantiate a type of [`~image_processing_utils.ImageProcessorMixin`] from an image processor, *e.g.* a
+        derived class of [`BaseImageProcessor`].
+
+        Args:
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
+                This can be either:
+
+                - a string, the *model id* of a pretrained image_processor hosted inside a model repo on
+                  huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
+                  namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+                - a path to a *directory* containing an image processor file saved using the
+                  [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] method, e.g.,
+                  `./my_model_directory/`.
+                - a path or url to a saved image processor JSON *file*, e.g.,
+                  `./my_model_directory/preprocessor_config.json`.
+            cache_dir (`str` or `os.PathLike`, *optional*):
+                Path to a directory in which a downloaded pretrained model image processor should be cached if the
+                standard cache should not be used.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force to (re-)download the image processor files and override the cached versions
+                if they exist.
+            resume_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to delete incompletely received files. Attempts to resume the download if such a file
+                exists.
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
+ use_auth_token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `transformers-cli login` (stored in `~/.huggingface`). + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + return_unused_kwargs (`bool`, *optional*, defaults to `False`): + If `False`, then this function returns just the final image processor object. If `True`, then this + functions returns a `Tuple(image_processor, unused_kwargs)` where *unused_kwargs* is a dictionary + consisting of the key/value pairs whose keys are not image processor attributes: i.e., the part of + `kwargs` which has not been used to update `image_processor` and is otherwise ignored. + kwargs (`Dict[str, Any]`, *optional*): + The values in kwargs of any keys which are image processor attributes will be used to override the + loaded values. Behavior concerning key/value pairs whose keys are *not* image processor attributes is + controlled by the `return_unused_kwargs` keyword parameter. + + + + Passing `use_auth_token=True` is required when you want to use a private model. + + + + Returns: + An image processor of type [`~image_processing_utils.ImageProcessorMixin`]. + + Examples: FIXME + + """ + image_processor_dict, kwargs = cls.get_image_processor_dict(pretrained_model_name_or_path, **kwargs) + + return cls.from_dict(image_processor_dict, **kwargs) + + def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs): + """ + Save an image_processor object to the directory `save_directory`, so that it can be re-loaded using the + [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] class method. + + Args: + save_directory (`str` or `os.PathLike`): + Directory where the image processor JSON file will be saved (will be created if it does not exist). + push_to_hub (`bool`, *optional*, defaults to `False`): + Whether or not to push your image processor to the Hugging Face model hub after saving it. + + + + Using `push_to_hub=True` will synchronize the repository you are pushing to with `save_directory`, + which requires `save_directory` to be a local clone of the repo you are pushing to if it's an existing + folder. Pass along `temp_dir=True` to use a temporary directory instead. + + + + kwargs: + Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. + """ + if os.path.isfile(save_directory): + raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") + + if push_to_hub: + commit_message = kwargs.pop("commit_message", None) + repo = self._create_or_get_repo(save_directory, **kwargs) + + # If we have a custom config, we copy the file defining it in the folder and set the attributes so it can be + # loaded from the Hub. 
+        if self._auto_class is not None:
+            custom_object_save(self, save_directory, config=self)
+
+        os.makedirs(save_directory, exist_ok=True)
+        # If we save using the predefined names, we can load using `from_pretrained`
+        output_image_processor_file = os.path.join(save_directory, IMAGE_PROCESSOR_NAME)
+
+        self.to_json_file(output_image_processor_file)
+        logger.info(f"Image processor saved in {output_image_processor_file}")
+
+        if push_to_hub:
+            url = self._push_to_hub(repo, commit_message=commit_message)
+            logger.info(f"Image processor pushed to the hub in this commit: {url}")
+
+        return [output_image_processor_file]
+
+    @classmethod
+    def get_image_processor_dict(
+        cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
+    ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        """
+        From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating an
+        image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`] using `from_dict`.
+
+        Parameters:
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
+                The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
+
+        Returns:
+            `Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the image processor object.
+        """
+        cache_dir = kwargs.pop("cache_dir", None)
+        force_download = kwargs.pop("force_download", False)
+        resume_download = kwargs.pop("resume_download", False)
+        proxies = kwargs.pop("proxies", None)
+        use_auth_token = kwargs.pop("use_auth_token", None)
+        local_files_only = kwargs.pop("local_files_only", False)
+        revision = kwargs.pop("revision", None)
+
+        from_pipeline = kwargs.pop("_from_pipeline", None)
+        from_auto_class = kwargs.pop("_from_auto", False)
+
+        user_agent = {"file_type": "image processor", "from_auto_class": from_auto_class}
+        if from_pipeline is not None:
+            user_agent["using_pipeline"] = from_pipeline
+
+        if is_offline_mode() and not local_files_only:
+            logger.info("Offline mode: forcing local_files_only=True")
+            local_files_only = True
+
+        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
+        if os.path.isdir(pretrained_model_name_or_path):
+            image_processor_file = os.path.join(pretrained_model_name_or_path, IMAGE_PROCESSOR_NAME)
+        elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
+            image_processor_file = pretrained_model_name_or_path
+        else:
+            image_processor_file = hf_bucket_url(
+                pretrained_model_name_or_path, filename=IMAGE_PROCESSOR_NAME, revision=revision, mirror=None
+            )
+
+        try:
+            # Load from URL or cache if already cached
+            resolved_image_processor_file = cached_path(
+                image_processor_file,
+                cache_dir=cache_dir,
+                force_download=force_download,
+                proxies=proxies,
+                resume_download=resume_download,
+                local_files_only=local_files_only,
+                use_auth_token=use_auth_token,
+                user_agent=user_agent,
+            )
+
+        except RepositoryNotFoundError:
+            raise EnvironmentError(
+                f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier listed on "
+                "'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a token having "
+                "permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass "
+                "`use_auth_token=True`."
+            )
+        except RevisionNotFoundError:
+            raise EnvironmentError(
+                f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for this "
+                f"model name. Check the model page at 'https://huggingface.co/{pretrained_model_name_or_path}' for "
+                "available revisions."
+            )
+        except EntryNotFoundError:
+            raise EnvironmentError(
+                f"{pretrained_model_name_or_path} does not appear to have a file named {IMAGE_PROCESSOR_NAME}."
+            )
+        except HTTPError as err:
+            raise EnvironmentError(
+                f"There was a specific connection error when trying to load {pretrained_model_name_or_path}:\n{err}"
+            )
+        except ValueError:
+            raise EnvironmentError(
+                f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this model, couldn't find it in"
+                f" the cached files and it looks like {pretrained_model_name_or_path} is not the path to a directory"
+                f" containing a {IMAGE_PROCESSOR_NAME} file.\nCheck out your internet connection or see how to run"
+                " the library in offline mode at"
+                " 'https://huggingface.co/docs/transformers/installation#offline-mode'."
+            )
+        except EnvironmentError:
+            raise EnvironmentError(
+                f"Can't load image processor for '{pretrained_model_name_or_path}'. If you were trying to load it "
+                "from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. "
+                f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory "
+                f"containing a {IMAGE_PROCESSOR_NAME} file"
+            )
+
+        try:
+            # Load image_processor dict
+            with open(resolved_image_processor_file, "r", encoding="utf-8") as reader:
+                text = reader.read()
+            image_processor_dict = json.loads(text)
+
+        except json.JSONDecodeError:
+            raise EnvironmentError(
+                f"It looks like the config file at '{resolved_image_processor_file}' is not a valid JSON file."
+            )
+
+        if resolved_image_processor_file == image_processor_file:
+            logger.info(f"loading image processor configuration file {image_processor_file}")
+        else:
+            logger.info(
+                f"loading image processor configuration file {image_processor_file} from cache at"
+                f" {resolved_image_processor_file}"
+            )
+
+        return image_processor_dict, kwargs
+
+    @classmethod
+    def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
+        """
+        Instantiates a type of [`~feature_extraction_utils.FeatureExtractionMixin`] from a Python dictionary of
+        parameters.
+
+        Args:
+            image_processor_dict (`Dict[str, Any]`):
+                Dictionary that will be used to instantiate the image processor object. Such a dictionary can be
+                retrieved from a pretrained checkpoint by leveraging the
+                [`~feature_extraction_utils.FeatureExtractionMixin.to_dict`] method.
+            kwargs (`Dict[str, Any]`):
+                Additional parameters from which to initialize the image processor object.
+
+        Returns:
+            [`~feature_extraction_utils.FeatureExtractionMixin`]: The image processor object instantiated from those
+            parameters.
+        """
+        return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
+
+        image_processor = cls(**image_processor_dict)
+
+        # Update image_processor with kwargs if needed
+        to_remove = []
+        for key, value in kwargs.items():
+            if hasattr(image_processor, key):
+                setattr(image_processor, key, value)
+                to_remove.append(key)
+        for key in to_remove:
+            kwargs.pop(key, None)
+
+        logger.info(f"Image processor {image_processor}")
+        if return_unused_kwargs:
+            return image_processor, kwargs
+        else:
+            return image_processor
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serializes this instance to a Python dictionary.
+
+        Returns:
+            `Dict[str, Any]`: Dictionary of all the attributes that make up this image processor instance.
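+
+        Example (an illustrative sketch; `MyImageProcessor` stands in for any subclass of this mixin):
+
+        ```python
+        processor = MyImageProcessor(do_resize=True)
+        config = processor.to_dict()
+        config["image_processor_type"]  # "MyImageProcessor"
+        ```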
+        """
+        output = copy.deepcopy(self.__dict__)
+        output["image_processor_type"] = self.__class__.__name__
+
+        return output
+
+    @classmethod
+    def from_json_file(cls, json_file: Union[str, os.PathLike]):
+        """
+        Instantiates an image processor of type [`~image_processing_utils.ImageProcessorMixin`] from the path to
+        a JSON file of parameters.
+
+        Args:
+            json_file (`str` or `os.PathLike`):
+                Path to the JSON file containing the parameters.
+
+        Returns:
+            An image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The image_processor
+            object instantiated from that JSON file.
+        """
+        with open(json_file, "r", encoding="utf-8") as reader:
+            text = reader.read()
+        image_processor_dict = json.loads(text)
+        return cls(**image_processor_dict)
+
+    def to_json_string(self) -> str:
+        """
+        Serializes this instance to a JSON string.
+
+        Returns:
+            `str`: String containing all the attributes that make up this image_processor instance in JSON format.
+        """
+        dictionary = self.to_dict()
+
+        for key, value in dictionary.items():
+            if isinstance(value, np.ndarray):
+                dictionary[key] = value.tolist()
+
+        # make sure private name "_processor_class" is correctly
+        # saved as "processor_class"
+        _processor_class = dictionary.pop("_processor_class", None)
+        if _processor_class is not None:
+            dictionary["processor_class"] = _processor_class
+
+        return json.dumps(dictionary, indent=2, sort_keys=True) + "\n"
+
+    def to_json_file(self, json_file_path: Union[str, os.PathLike]):
+        """
+        Save this instance to a JSON file.
+
+        Args:
+            json_file_path (`str` or `os.PathLike`):
+                Path to the JSON file in which this image_processor instance's parameters will be saved.
+        """
+        with open(json_file_path, "w", encoding="utf-8") as writer:
+            writer.write(self.to_json_string())
+
+    def __repr__(self):
+        return f"{self.__class__.__name__} {self.to_json_string()}"
+
+    @classmethod
+    def register_for_auto_class(cls, auto_class="AutoImageProcessor"):
+        """
+        Register this class with a given auto class. This should only be used for custom image processors as the ones
+        in the library are already mapped with `AutoImageProcessor`.
+
+        <Tip warning={true}>
+
+        This API is experimental and may have some slight breaking changes in the next releases.
+
+        </Tip>
+
+        Args:
+            auto_class (`str` or `type`, *optional*, defaults to `"AutoImageProcessor"`):
+                The auto class to register this new image processor with.
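+
+        Example (a sketch; `MyImageProcessor` is a hypothetical custom processor, and `AutoImageProcessor` is
+        assumed to exist in `transformers.models.auto`, which later commits in this series add):
+
+        ```python
+        class MyImageProcessor(ImageProcessorMixin):
+            ...
+
+        MyImageProcessor.register_for_auto_class("AutoImageProcessor")
+        ```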
+ """ + if not isinstance(auto_class, str): + auto_class = auto_class.__name__ + + import transformers.models.auto as auto_module + + if not hasattr(auto_module, auto_class): + raise ValueError(f"{auto_class} is not a valid auto class.") + + cls._auto_class = auto_class + + +ImageProcessorMixin.push_to_hub = copy_func(ImageProcessorMixin.push_to_hub) +ImageProcessorMixin.push_to_hub.__doc__ = ImageProcessorMixin.push_to_hub.__doc__.format( + object="image processor", object_class="AutoImageProcessor", object_files="image processor file" +) diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index 1ee4521514afb9..530f456a90085a 100644 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -156,6 +156,7 @@ FLAX_WEIGHTS_INDEX_NAME = "flax_model.msgpack.index.json" CONFIG_NAME = "config.json" FEATURE_EXTRACTOR_NAME = "preprocessor_config.json" +IMAGE_PROCESSOR_NAME = "image_processor_config.json" MODEL_CARD_NAME = "modelcard.json" SENTENCEPIECE_UNDERLINE = "▁" From 54aed8b374dd0766abe0dc762d4bbe543afdcb4e Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Wed, 27 Jul 2022 16:12:56 +0100 Subject: [PATCH 003/162] Base processor skeleton --- src/transformers/image_processing_utils.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index e053f4f486a788..908216cd463435 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -424,3 +424,14 @@ def register_for_auto_class(cls, auto_class="AutoImageProcessor"): ImageProcessorMixin.push_to_hub.__doc__ = ImageProcessorMixin.push_to_hub.__doc__.format( object="image processor", object_class="AutoImageProcessor", object_files="image processor file" ) + + +class BaseImageProcessor(ImageProcessorMixin): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def __call__(self, *args, **kwargs): + return self.preprocess(*args, **kwargs) + + def preprocess(self, *args, **kwargs): + raise NotImplementedError("Each image processor must implement its own preprocess method") From ba55c8996ae83e3f066ac9cd9e557e9ad66e1a74 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Wed, 27 Jul 2022 17:30:59 +0100 Subject: [PATCH 004/162] BatchFeature for packaging image processor outputs --- src/transformers/image_processing_utils.py | 164 ++++++++++++++++++++- 1 file changed, 159 insertions(+), 5 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 908216cd463435..32477c7f4f777a 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -16,30 +16,184 @@ import copy import os import json -from typing import Any, Dict, Tuple, Union +from collections import UserDict +from typing import Any, Dict, Optional, Tuple, Union import numpy as np from requests import HTTPError from .dynamic_module_utils import custom_object_save from .utils import ( - IMAGE_PROCESSOR_NAME, - PushToHubMixin, - logging, HUGGINGFACE_CO_RESOLVE_ENDPOINT, + IMAGE_PROCESSOR_NAME, EntryNotFoundError, + PushToHubMixin, RepositoryNotFoundError, RevisionNotFoundError, + TensorType, cached_path, copy_func, hf_bucket_url, + is_flax_available, is_remote_url, - is_offline_mode + is_torch_available, + is_tf_available, + is_offline_mode, + logging, + torch_required, ) +from .utils.generic import _is_jax, _is_numpy, _is_torch_device logger = logging.get_logger(__name__) +class 
BatchFeature(UserDict): + r""" + Holds the output of the image processor specific `__call__` methods. + + This class is derived from a python dictionary and can be used as a dictionary. + + Args: + data (`dict`): + Dictionary of lists/arrays/tensors returned by the __call__/pad methods ('input_values', 'attention_mask', + etc.). + tensor_type (`Union[None, str, TensorType]`, *optional*): + You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at + initialization. + """ + + def __init__(self, data: Optional[Dict[str, Any]] = None, tensor_type: Union[None, str, TensorType] = None): + super().__init__(data) + self.convert_to_tensors(tensor_type=tensor_type) + + # Copied from transformers.tokenization_utils_base.BatchEncoding.__getitem__ + def __getitem__(self, item: str) -> Any: + """ + If the key is a string, returns the value of the dict associated to `key` ('input_values', 'attention_mask', + etc.). + """ + if isinstance(item, str): + return self.data[item] + else: + raise KeyError("Indexing with integers is not available when using Python based feature extractors") + + # Copied from transformers.tokenization_utils_base.BatchEncoding.__getattr__ + def __getattr__(self, item: str): + try: + return self.data[item] + except KeyError: + raise AttributeError + + # Copied from transformers.feature_extraction_utils.BatchFeature.__getstate__ + def __getstate__(self): + return {"data": self.data} + + # Copied from transformers.feature_extraction_utils.BatchFeature.__setstate__ + def __setstate__(self, state): + if "data" in state: + self.data = state["data"] + + # Copied from transformers.tokenization_utils_base.BatchEncoding.keys + def keys(self): + return self.data.keys() + + # Copied from transformers.tokenization_utils_base.BatchEncoding.values + def values(self): + return self.data.values() + + # Copied from transformers.tokenization_utils_base.BatchEncoding.items + def items(self): + return self.data.items() + + # Copied from transformers.feature_extraction_utils.BatchFeature.convert_to_tensors + def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None): + """ + Convert the inner content to tensors. + + Args: + tensor_type (`str` or [`~utils.TensorType`], *optional*): + The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`]. If + `None`, no modification is done. + """ + if tensor_type is None: + return self + + # Convert to TensorType + if not isinstance(tensor_type, TensorType): + tensor_type = TensorType(tensor_type) + + # Get a function reference for the correct framework + if tensor_type == TensorType.TENSORFLOW: + if not is_tf_available(): + raise ImportError( + "Unable to convert output to TensorFlow tensors format, TensorFlow is not installed." 
+ ) + import tensorflow as tf + + as_tensor = tf.constant + is_tensor = tf.is_tensor + elif tensor_type == TensorType.PYTORCH: + if not is_torch_available(): + raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.") + import torch + + def as_tensor(value): + if isinstance(value, (list, tuple)) and len(value) > 0 and isinstance(value[0], np.ndarray): + value = np.array(value) + return torch.tensor(value) + + is_tensor = torch.is_tensor + elif tensor_type == TensorType.JAX: + if not is_flax_available(): + raise ImportError("Unable to convert output to JAX tensors format, JAX is not installed.") + import jax.numpy as jnp # noqa: F811 + + as_tensor = jnp.array + is_tensor = _is_jax + else: + as_tensor = np.asarray + is_tensor = _is_numpy + + # Do the tensor conversion in batch + for key, value in self.items(): + try: + if not is_tensor(value): + tensor = as_tensor(value) + + self[key] = tensor + except: # noqa E722 + if key == "overflowing_values": + raise ValueError("Unable to create tensor returning overflowing values of different lengths. ") + raise ValueError( + "Unable to create tensor, you should probably activate padding " + "with 'padding=True' to have batched tensors with the same length." + ) + + return self + + @torch_required + # Copied from transformers.tokenization_utils_base.BatchEncoding.to with BatchEncoding->BatchFeature + def to(self, device: Union[str, "torch.device"]) -> "BatchFeature": + """ + Send all values to device by calling `v.to(device)` (PyTorch only). + + Args: + device (`str` or `torch.device`): The device to put the tensors on. + + Returns: + [`BatchFeature`]: The same instance after modification. + """ + + # This check catches things like APEX blindly calling "to" on all inputs to a module + # Otherwise it passes the casts down and casts the LongTensor containing the token idxs + # into a HalfTensor + if isinstance(device, str) or _is_torch_device(device) or isinstance(device, int): + self.data = {k: v.to(device=device) for k, v in self.data.items()} + else: + logger.warning(f"Attempting to cast a BatchFeature to type {str(device)}. 
This is not supported.") + return self + + class ImageProcessorMixin(PushToHubMixin): """ Image processor mixin used to provide saving/loading functionality From 4b430d4de00b1ddb21882eaddce5e2f5c61842f7 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Wed, 27 Jul 2022 19:38:50 +0100 Subject: [PATCH 005/162] Initial image processor for GLPN --- src/transformers/image_processing_utils.py | 24 +++--- src/transformers/image_transforms.py | 26 +++++-- src/transformers/image_utils.py | 61 +++++++++++++-- .../models/glpn/image_processing_glpn.py | 76 +++++++++++++++++++ 4 files changed, 165 insertions(+), 22 deletions(-) create mode 100644 src/transformers/models/glpn/image_processing_glpn.py diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 32477c7f4f777a..2f1377b3773f0f 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -23,6 +23,7 @@ from requests import HTTPError from .dynamic_module_utils import custom_object_save +from .image_utils import ImageType from .utils import ( HUGGINGFACE_CO_RESOLVE_ENDPOINT, IMAGE_PROCESSOR_NAME, @@ -105,25 +106,24 @@ def values(self): def items(self): return self.data.items() - # Copied from transformers.feature_extraction_utils.BatchFeature.convert_to_tensors - def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None): + def convert_to_tensors(self, tensor_type: Optional[Union[str, ImageType]] = None): """ Convert the inner content to tensors. Args: - tensor_type (`str` or [`~utils.TensorType`], *optional*): - The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`]. If + tensor_type (`str` or [`~utils.ImageType`], *optional*): + The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.ImageType`]. If `None`, no modification is done. """ if tensor_type is None: return self # Convert to TensorType - if not isinstance(tensor_type, TensorType): - tensor_type = TensorType(tensor_type) + if not isinstance(tensor_type, ImageType): + tensor_type = ImageType(tensor_type) # Get a function reference for the correct framework - if tensor_type == TensorType.TENSORFLOW: + if tensor_type == ImageType.TENSORFLOW: if not is_tf_available(): raise ImportError( "Unable to convert output to TensorFlow tensors format, TensorFlow is not installed." 
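The switch from `TensorType` to `ImageType` above is easiest to see end to end (an illustrative sketch, separate from the patch; the short strings mirror the `ImageType` values added to `image_utils.py` in this same commit):

import numpy as np
from transformers.image_processing_utils import BatchFeature

images = [np.zeros((3, 32, 32)), np.ones((3, 32, 32))]

# With tensor_type=None, the list is stored as-is.
features = BatchFeature(data={"pixel_values": images})

# With "np", the list is stacked into a single array; "pt", "tf" and "jax"
# select the other frameworks when they are installed.
features = BatchFeature(data={"pixel_values": images}, tensor_type="np")
features["pixel_values"].shape  # (2, 3, 32, 32)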
@@ -132,7 +132,7 @@ def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = Non as_tensor = tf.constant is_tensor = tf.is_tensor - elif tensor_type == TensorType.PYTORCH: + elif tensor_type == ImageType.PYTORCH: if not is_torch_available(): raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.") import torch @@ -143,7 +143,7 @@ def as_tensor(value): return torch.tensor(value) is_tensor = torch.is_tensor - elif tensor_type == TensorType.JAX: + elif tensor_type == ImageType.JAX: if not is_flax_available(): raise ImportError("Unable to convert output to JAX tensors format, JAX is not installed.") import jax.numpy as jnp # noqa: F811 @@ -584,8 +584,8 @@ class BaseImageProcessor(ImageProcessorMixin): def __init__(self, **kwargs): super().__init__(**kwargs) - def __call__(self, *args, **kwargs): - return self.preprocess(*args, **kwargs) + def __call__(self, images, **kwargs) -> BatchFeature: + return self.preprocess(images, **kwargs) - def preprocess(self, *args, **kwargs): + def preprocess(self, images, **kwargs) -> BatchFeature: raise NotImplementedError("Each image processor must implement its own preprocess method") diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 7585aff7643265..55d788d4753175 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -28,10 +28,26 @@ ) +def rescale(image: np.ndarray, scale: Union[float, int] = 255) -> np.ndarray: + """ + Rescales `image` by `scale`. + + Args: + image (`np.ndarray``): + The image to rescale. + scale (`float`, `int`): + The scale to use for rescaling the image. + + Returns: + image: A rescaled np.ndarray image. + """ + return image * scale + + def to_pil_image( image: Union[np.ndarray, PIL.Image.Image, "torch.Tensor", "tf.Tensor"], channel_dim: Optional[ChannelDimension] = None, - rescale=None + do_rescale: Optional[bool] = None ) -> PIL.Image.Image: """ Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if @@ -61,9 +77,9 @@ def to_pil_image( image = image.transpose((1, 2, 0)) # PIL.Image can only store uint8 values, so we rescale the image to be between 0 and 255 if needed. - rescale = isinstance(image.flat[0], float) if rescale is None else rescale - if rescale: - rescale = image * 255 + do_rescale = isinstance(image.flat[0], float) if do_rescale is None else do_rescale + if do_rescale: + image = rescale(image, 255) image = image.astype(np.uint8) return PIL.Image.fromarray(image) @@ -107,7 +123,7 @@ def get_resize_output_image_size( return (new_short, new_long) if width <= height else (new_long, new_short) -def resize(image, size: Tuple[int, int], resample=PIL.Image.BILINEAR): +def resize(image, size: Tuple[int, int], resample=PIL.Image.Resampling.BILINEAR): """ Resizes `image`. Enforces conversion of input to PIL.Image. 
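Two changes above are worth calling out: pixel scaling moves into a reusable `rescale` helper, and the `rescale` argument of `to_pil_image` becomes `do_rescale`, so the boolean flag no longer shadows the new function. A sketch of the helper in isolation (illustrative, separate from the patch):

import numpy as np
from transformers.image_transforms import rescale

image = np.random.rand(3, 32, 32)          # floats in [0, 1]
scaled = rescale(image, scale=255)         # floats in [0, 255]
restored = rescale(scaled, scale=1 / 255)  # back to [0, 1]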

diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py
index e23321be478040..e5afb3122bcd03 100644
--- a/src/transformers/image_utils.py
+++ b/src/transformers/image_utils.py
@@ -23,8 +23,8 @@
 
 import requests
 
-from .utils import is_torch_available, is_tf_available, is_jax_available
-from .utils.generic import _is_torch, _is_tensorflow, _is_jax
+from .utils import TensorType, is_torch_available, is_tf_available, is_jax_available
+from .utils.generic import ExplicitEnum, _is_torch, _is_tensorflow, _is_jax, _is_numpy, to_numpy
 
 
 IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406]
@@ -36,6 +36,21 @@
     PIL.Image.Image, np.ndarray, "torch.Tensor", List[PIL.Image.Image], List[np.ndarray], List["torch.Tensor"]  # noqa
 ]
 
+class ChannelDimension(enum.Enum):
+    FIRST = 1
+    LAST = 3
+
+
+class ImageType(ExplicitEnum):
+    """
+    Possible image data formats that can be fed into an image processor
+    """
+    PYTORCH = "pt"
+    TENSORFLOW = "tf"
+    NUMPY = "np"
+    JAX = "jax"
+    PIL = "pillow"
+
 
 def is_torch_tensor(obj):
     return _is_torch(obj) if is_torch_available() else False
@@ -49,9 +64,45 @@
     return _is_jax(obj) if is_jax_available() else False
 
 
-class ChannelDimension(enum.Enum):
-    FIRST = 1
-    LAST = 3
+def is_valid_image(img):
+    return (
+        isinstance(img, (PIL.Image.Image, np.ndarray))
+        or is_torch_tensor(img)
+        or is_tf_tensor(img)
+        or is_jax_tensor(img)
+    )
+
+
+def valid_images(imgs):
+    return all(is_valid_image(img) for img in imgs)
+
+
+def is_batched(img):
+    if isinstance(img, (list, tuple)):
+        return is_valid_image(img[0])
+    return False
+
+
+def get_image_type(obj) -> ImageType:
+    if is_torch_tensor(obj):
+        return ImageType.PYTORCH
+    elif is_tf_tensor(obj):
+        return ImageType.TENSORFLOW
+    elif is_jax_tensor(obj):
+        return ImageType.JAX
+    elif _is_numpy(obj):
+        return ImageType.NUMPY
+    elif isinstance(obj, PIL.Image.Image):
+        return ImageType.PIL
+    else:
+        raise ValueError("Could not infer image type")
+
+
+def to_numpy_array(img) -> np.ndarray:
+    input_type = get_image_type(img)
+    if input_type == ImageType.PIL:
+        return np.array(img)
+    return to_numpy(img)
 
 
 def infer_channel_dimension(image: np.ndarray) -> ChannelDimension:
diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py
new file mode 100644
index 00000000000000..9942845e5e4550
--- /dev/null
+++ b/src/transformers/models/glpn/image_processing_glpn.py
@@ -0,0 +1,76 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for GLPN.""" + +from tkinter import Image +from typing import Union + +from numpy import np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature +from ...image_transforms import resize, rescale +from ...image_utils import ImageType, is_batched, to_numpy_array, valid_images, get_image_size +from ...utils import logging + +logger = logging.get_logger(__name__) + + +class GLPNImageProcessor(BaseImageProcessor): + def __init__(self, do_resize=True, do_rescale=True, size_divisor=32, resample=Image.Resampling.BILINEAR, **kwargs) -> None: + self.do_resize = do_resize + self.do_rescale = do_rescale + self.size_divisor = size_divisor + self.resample = resample + super().__init__(**kwargs) + + def resize(self, image: np.ndarray, size_divisor: Union[int, float], resample: Image.Resampling, **kwargs) -> np.ndarray: + height, width = get_image_size(image) + new_h = height // size_divisor * size_divisor + new_w = width // size_divisor * size_divisor + image = resize(image, (new_h, new_w), resample=resample, **kwargs) + return image + + def rescale(self, image: np.ndarray, scale: Union[int, float], **kwargs) -> np.ndarray: + return rescale(image, scale, **kwargs) + + def preprocess(self, images, do_resize=None, do_rescale=None, size_divisor=None, resample=None, return_tensors=None, **kwargs) -> BatchFeature: + do_resize = do_resize if do_resize is not None else self.do_resize + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + size_divisor = size_divisor if size_divisor is not None else self.size_divisor + resample = resample if resample is not None else self.resample + + # If a return type isn't specified, default to numpy arrays. + return_tensors = ImageType.NUMPY if return_tensors is None else return_tensors + + if do_resize and size_divisor is None: + raise ValueError("size_divisor is required for resizing") + + if not is_batched(images): + images = [images] + + if not valid_images(images): + raise ValueError("Invalid image(s)") + + # All transformations expect numpy arrays. + images = [to_numpy_array(img) for img in images] + + if do_resize: + images = [self.resize(image, size_divisor=size_divisor, resample=resample) for image in images] + + if do_rescale: + images = [self.rescale(image) for image in images] + + data = {"pixel_values": images} + return BatchFeature(**data, return_tensors=return_tensors) From b1c8b59fbe165d8a3f542c280fe65486eef0c2ad Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Wed, 27 Jul 2022 20:02:51 +0100 Subject: [PATCH 006/162] REmove accidental import --- src/transformers/models/glpn/image_processing_glpn.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py index 9942845e5e4550..bdc33e80ce9f87 100644 --- a/src/transformers/models/glpn/image_processing_glpn.py +++ b/src/transformers/models/glpn/image_processing_glpn.py @@ -14,9 +14,9 @@ # limitations under the License. 
"""Image processor class for GLPN.""" -from tkinter import Image from typing import Union +import PIL.Image from numpy import np from ...image_processing_utils import BaseImageProcessor, BatchFeature @@ -28,14 +28,14 @@ class GLPNImageProcessor(BaseImageProcessor): - def __init__(self, do_resize=True, do_rescale=True, size_divisor=32, resample=Image.Resampling.BILINEAR, **kwargs) -> None: + def __init__(self, do_resize=True, do_rescale=True, size_divisor=32, resample=PIL.Image.Resampling.BILINEAR, **kwargs) -> None: self.do_resize = do_resize self.do_rescale = do_rescale self.size_divisor = size_divisor self.resample = resample super().__init__(**kwargs) - def resize(self, image: np.ndarray, size_divisor: Union[int, float], resample: Image.Resampling, **kwargs) -> np.ndarray: + def resize(self, image: np.ndarray, size_divisor: Union[int, float], resample: PIL.Image.Resampling, **kwargs) -> np.ndarray: height, width = get_image_size(image) new_h = height // size_divisor * size_divisor new_w = width // size_divisor * size_divisor From daf069a22311c47ab5a32b44710707a837b77f0c Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 13:18:18 +0100 Subject: [PATCH 007/162] Fixup and docs --- .../en/internal/image_processing_utils.mdx | 24 +++++++++ src/transformers/image_transforms.py | 50 ++++++++++++++++--- src/transformers/image_utils.py | 6 +-- 3 files changed, 70 insertions(+), 10 deletions(-) create mode 100644 docs/source/en/internal/image_processing_utils.mdx diff --git a/docs/source/en/internal/image_processing_utils.mdx b/docs/source/en/internal/image_processing_utils.mdx new file mode 100644 index 00000000000000..ae4f826517aa1c --- /dev/null +++ b/docs/source/en/internal/image_processing_utils.mdx @@ -0,0 +1,24 @@ + + +# Utilities for Image Processors + +This page lists all the utility functions used by the image processors, mainly the functional +transformations used to process the images. + +Most of those are only useful if you are studying the code of the image processors in the library. + +## Image Transformations + +[[autodoc]] image_transforms.to_pil_image + +[[autodoc]] image_transforms.resize diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 7585aff7643265..9112506e8ebffa 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -13,25 +13,34 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional, Tuple, List, Union +from typing import TYPE_CHECKING, List, Optional, Tuple, Union -import PIL import numpy as np +import PIL + +from transformers.utils.import_utils import is_tf_available, is_torch_available from .image_utils import ( ChannelDimension, get_image_size, infer_channel_dimension, - is_torch_tensor, + is_jax_tensor, is_tf_tensor, - is_jax_tensor + is_torch_tensor, ) +if TYPE_CHECKING: + if is_torch_available(): + import torch + if is_tf_available(): + import tensorflow as tf + + def to_pil_image( image: Union[np.ndarray, PIL.Image.Image, "torch.Tensor", "tf.Tensor"], channel_dim: Optional[ChannelDimension] = None, - rescale=None + rescale=None, ) -> PIL.Image.Image: """ Converts `image` to a PIL Image. 
Optionally rescales it and puts the channel dimension back as the last axis if @@ -70,10 +79,37 @@ def to_pil_image( def get_resize_output_image_size( input_image: np.ndarray, - size: Union[int, Tuple[int, int], List[int]], + size: Union[int, Tuple[int, int], List[int], Tuple[int]], default_to_square: bool = True, - max_size: int = None + max_size: int = None, ) -> np.ndarray: + """ + Find the target (height, width) dimension of the output image after resizing given the input image and the desired + size. + + Args: + input_image (`np.ndarray`): + The image to resize. + size (`int` or `Tuple[int, int]` or List[int] or Tuple[int]): + The size to use for resizing the image. If `size` is a sequence like (h, w), output size will be matched to + this. + + If `size` is an int and `default_to_square` is `True`, then image will be resized to (size, size). If + `size` is an int and `default_to_square` is `False`, then smaller edge of the image will be matched to this + number. i.e, if height > width, then image will be rescaled to (size * height / width, size). + resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`): + The filter to user for resampling. + default_to_square (`bool`, *optional*, defaults to `True`): + How to convert `size` when it is a single int. If set to `True`, the `size` will be converted to a square + (`size`,`size`). If set to `False`, will replicate + [`torchvision.transforms.Resize`](https://pytorch.org/vision/stable/transforms.html#torchvision.transforms.Resize) + with support for resizing only the smallest edge and providing an optional `max_size`. + max_size (`int`, *optional*, defaults to `None`): + The maximum allowed for the longer edge of the resized image: if the longer edge of the image is greater + than `max_size` after being resized according to `size`, then the image is resized again so that the longer + edge is equal to `max_size`. As a result, `size` might be overruled, i.e the smaller edge may be shorter + than `size`. Only used if `default_to_square` is `False`. 
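+
+    Example (illustrative; the shortest edge is matched to `size` and the aspect ratio is kept):
+
+    ```python
+    >>> import numpy as np
+    >>> from transformers.image_transforms import get_resize_output_image_size
+    >>> image = np.zeros((3, 400, 600))  # (num_channels, height, width)
+    >>> get_resize_output_image_size(image, size=224, default_to_square=False)
+    (224, 336)
+    ```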
+ """ if isinstance(size, (tuple, list)): if len(size) == 2: return size diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index e23321be478040..15bcf99542614c 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -23,8 +23,8 @@ import requests -from .utils import is_torch_available, is_tf_available, is_jax_available -from .utils.generic import _is_torch, _is_tensorflow, _is_jax +from .utils import is_flax_available, is_tf_available, is_torch_available +from .utils.generic import _is_jax, _is_tensorflow, _is_torch IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406] @@ -46,7 +46,7 @@ def is_tf_tensor(obj): def is_jax_tensor(obj): - return _is_jax(obj) if is_jax_available() else False + return _is_jax(obj) if is_flax_available() else False class ChannelDimension(enum.Enum): From 95b4a6af8d0a7a6639326904b3c24621a904fda5 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Wed, 27 Jul 2022 15:55:48 +0100 Subject: [PATCH 008/162] Mixin for saving the image processor --- src/transformers/image_processing_utils.py | 426 +++++++++++++++++++++ src/transformers/utils/__init__.py | 1 + 2 files changed, 427 insertions(+) create mode 100644 src/transformers/image_processing_utils.py diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py new file mode 100644 index 00000000000000..e053f4f486a788 --- /dev/null +++ b/src/transformers/image_processing_utils.py @@ -0,0 +1,426 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
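Alongside the mixin definition, the save/load round trip it provides is worth a concrete sketch (illustrative; it uses the GLPN processor from patch 005 and the `image_processor_config.json` filename registered in `utils/__init__.py`):

from transformers.models.glpn.image_processing_glpn import GLPNImageProcessor

image_processor = GLPNImageProcessor(size_divisor=32)
# save_pretrained writes image_processor_config.json into the directory.
image_processor.save_pretrained("./glpn-image-processor")

# from_pretrained reads it back and rebuilds an equivalent processor.
reloaded = GLPNImageProcessor.from_pretrained("./glpn-image-processor")
assert reloaded.size_divisor == 32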
+ +import copy +import os +import json +from typing import Any, Dict, Tuple, Union + +import numpy as np +from requests import HTTPError + +from .dynamic_module_utils import custom_object_save +from .utils import ( + IMAGE_PROCESSOR_NAME, + PushToHubMixin, + logging, + HUGGINGFACE_CO_RESOLVE_ENDPOINT, + EntryNotFoundError, + RepositoryNotFoundError, + RevisionNotFoundError, + cached_path, + copy_func, + hf_bucket_url, + is_remote_url, + is_offline_mode +) + +logger = logging.get_logger(__name__) + + +class ImageProcessorMixin(PushToHubMixin): + """ + Image processor mixin used to provide saving/loading functionality + """ + + _auto_class = None + + def __init__(self, **kwargs): + """Set elements of `kwargs` as attributes.""" + # Pop "processor_class" as it should be saved as private attribute + self._processor_class = kwargs.pop("processor_class", None) + # Additional attributes without default values + for key, value in kwargs.items(): + try: + setattr(self, key, value) + except AttributeError as err: + logger.error(f"Can't set {key} with value {value} for {self}") + raise err + + def _set_processor_class(self, processor_class: str): + """Sets processor class as an attribute.""" + self._processor_class = processor_class + + @classmethod + def from_pretrained( + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ): + r""" + Instantiate a type of [`~image_processing_utils.ImageProcessorMixin`] from a image processor, *e.g.* a + derived class of [`BaseImageProcessor`]. + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + This can be either: + + - a string, the *model id* of a pretrained image_processor hosted inside a model repo on + huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or + namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`. + - a path to a *directory* containing a image processor file saved using the + [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] method, e.g., + `./my_model_directory/`. + - a path or url to a saved image processor JSON *file*, e.g., + `./my_model_directory/preprocessor_config.json`. + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model image processor should be cached if the + standard cache should not be used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force to (re-)download the image processor files and override the cached versions + if they exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to delete incompletely received file. Attempts to resume the download if such a file + exists. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. + use_auth_token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `transformers-cli login` (stored in `~/.huggingface`). + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. 
+ return_unused_kwargs (`bool`, *optional*, defaults to `False`): + If `False`, then this function returns just the final image processor object. If `True`, then this + functions returns a `Tuple(image_processor, unused_kwargs)` where *unused_kwargs* is a dictionary + consisting of the key/value pairs whose keys are not image processor attributes: i.e., the part of + `kwargs` which has not been used to update `image_processor` and is otherwise ignored. + kwargs (`Dict[str, Any]`, *optional*): + The values in kwargs of any keys which are image processor attributes will be used to override the + loaded values. Behavior concerning key/value pairs whose keys are *not* image processor attributes is + controlled by the `return_unused_kwargs` keyword parameter. + + + + Passing `use_auth_token=True` is required when you want to use a private model. + + + + Returns: + An image processor of type [`~image_processing_utils.ImageProcessorMixin`]. + + Examples: FIXME + + """ + image_processor_dict, kwargs = cls.get_image_processor_dict(pretrained_model_name_or_path, **kwargs) + + return cls.from_dict(image_processor_dict, **kwargs) + + def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs): + """ + Save an image_processor object to the directory `save_directory`, so that it can be re-loaded using the + [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] class method. + + Args: + save_directory (`str` or `os.PathLike`): + Directory where the image processor JSON file will be saved (will be created if it does not exist). + push_to_hub (`bool`, *optional*, defaults to `False`): + Whether or not to push your image processor to the Hugging Face model hub after saving it. + + + + Using `push_to_hub=True` will synchronize the repository you are pushing to with `save_directory`, + which requires `save_directory` to be a local clone of the repo you are pushing to if it's an existing + folder. Pass along `temp_dir=True` to use a temporary directory instead. + + + + kwargs: + Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. + """ + if os.path.isfile(save_directory): + raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") + + if push_to_hub: + commit_message = kwargs.pop("commit_message", None) + repo = self._create_or_get_repo(save_directory, **kwargs) + + # If we have a custom config, we copy the file defining it in the folder and set the attributes so it can be + # loaded from the Hub. 
+ if self._auto_class is not None: + custom_object_save(self, save_directory, config=self) + + os.makedirs(save_directory, exist_ok=True) + # If we save using the predefined names, we can load using `from_pretrained` + output_image_processor_file = os.path.join(save_directory, IMAGE_PROCESSOR_NAME) + + self.to_json_file(output_image_processor_file) + logger.info(f"Image processor saved in {output_image_processor_file}") + + if push_to_hub: + url = self._push_to_hub(repo, commit_message=commit_message) + logger.info(f"Image processor pushed to the hub in this commit: {url}") + + return [output_image_processor_file] + + @classmethod + def get_image_processor_dict( + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + """ + From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a + image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`] using `from_dict`. + + Parameters: + pretrained_model_name_or_path (`str` or `os.PathLike`): + The identifier of the pre-trained checkpoint from which we want the dictionary of parameters. + + Returns: + `Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the image processor object. + """ + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + use_auth_token = kwargs.pop("use_auth_token", None) + local_files_only = kwargs.pop("local_files_only", False) + revision = kwargs.pop("revision", None) + + from_pipeline = kwargs.pop("_from_pipeline", None) + from_auto_class = kwargs.pop("_from_auto", False) + + user_agent = {"file_type": "image processor", "from_auto_class": from_auto_class} + if from_pipeline is not None: + user_agent["using_pipeline"] = from_pipeline + + if is_offline_mode() and not local_files_only: + logger.info("Offline mode: forcing local_files_only=True") + local_files_only = True + + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + if os.path.isdir(pretrained_model_name_or_path): + image_processor_file = os.path.join(pretrained_model_name_or_path, IMAGE_PROCESSOR_NAME) + elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): + image_processor_file = pretrained_model_name_or_path + else: + image_processor_file = hf_bucket_url( + pretrained_model_name_or_path, filename=IMAGE_PROCESSOR_NAME, revision=revision, mirror=None + ) + + try: + # Load from URL or cache if already cached + resolved_image_processor_file = cached_path( + image_processor_file, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + user_agent=user_agent, + ) + + except RepositoryNotFoundError: + raise EnvironmentError( + f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier listed on " + "'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a token having " + "permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass " + "`use_auth_token=True`." + ) + except RevisionNotFoundError: + raise EnvironmentError( + f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for this " + f"model name. 
Check the model page at 'https://huggingface.co/{pretrained_model_name_or_path}' for " + "available revisions." + ) + except EntryNotFoundError: + raise EnvironmentError( + f"{pretrained_model_name_or_path} does not appear to have a file named {IMAGE_PROCESSOR_NAME}." + ) + except HTTPError as err: + raise EnvironmentError( + f"There was a specific connection error when trying to load {pretrained_model_name_or_path}:\n{err}" + ) + except ValueError: + raise EnvironmentError( + f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this model, couldn't find it in" + f" the cached files and it looks like {pretrained_model_name_or_path} is not the path to a directory" + f" containing a {IMAGE_PROCESSOR_NAME} file.\nCheckout your internet connection or see how to run" + " the library in offline mode at" + " 'https://huggingface.co/docs/transformers/installation#offline-mode'." + ) + except EnvironmentError: + raise EnvironmentError( + f"Can't load image processor for '{pretrained_model_name_or_path}'. If you were trying to load it " + "from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. " + f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory " + f"containing a {IMAGE_PROCESSOR_NAME} file" + ) + + try: + # Load image_processor dict + with open(resolved_image_processor_file, "r", encoding="utf-8") as reader: + text = reader.read() + image_processor_dict = json.loads(text) + + except json.JSONDecodeError: + raise EnvironmentError( + f"It looks like the config file at '{resolved_image_processor_file}' is not a valid JSON file." + ) + + if resolved_image_processor_file == image_processor_file: + logger.info(f"loading image processor configuration file {image_processor_file}") + else: + logger.info( + f"loading image processor configuration file {image_processor_file} from cache at" + f" {resolved_image_processor_file}" + ) + + return image_processor_dict, kwargs + + @classmethod + def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): + """ + Instantiates a type of [`~feature_extraction_utils.FeatureExtractionMixin`] from a Python dictionary of + parameters. + + Args: + image_processor_dict (`Dict[str, Any]`): + Dictionary that will be used to instantiate the image processor object. Such a dictionary can be + retrieved from a pretrained checkpoint by leveraging the + [`~feature_extraction_utils.FeatureExtractionMixin.to_dict`] method. + kwargs (`Dict[str, Any]`): + Additional parameters from which to initialize the image processor object. + + Returns: + [`~feature_extraction_utils.FeatureExtractionMixin`]: The image processor object instantiated from those + parameters. + """ + return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) + + image_processor = cls(**image_processor_dict) + + # Update image_processor with kwargs if needed + to_remove = [] + for key, value in kwargs.items(): + if hasattr(image_processor, key): + setattr(image_processor, key, value) + to_remove.append(key) + for key in to_remove: + kwargs.pop(key, None) + + logger.info(f"image processor {image_processor}") + if return_unused_kwargs: + return image_processor, kwargs + else: + return image_processor + + def to_dict(self) -> Dict[str, Any]: + """ + Serializes this instance to a Python dictionary. + + Returns: + `Dict[str, Any]`: Dictionary of all the attributes that make up this image processor instance. 
+        """
+        output = copy.deepcopy(self.__dict__)
+        output["image_processor_type"] = self.__class__.__name__
+
+        return output
+
+    @classmethod
+    def from_json_file(cls, json_file: Union[str, os.PathLike]):
+        """
+        Instantiates an image processor of type [`~image_processing_utils.ImageProcessorMixin`] from the path to
+        a JSON file of parameters.
+
+        Args:
+            json_file (`str` or `os.PathLike`):
+                Path to the JSON file containing the parameters.
+
+        Returns:
+            A image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The image_processor
+            object instantiated from that JSON file.
+        """
+        with open(json_file, "r", encoding="utf-8") as reader:
+            text = reader.read()
+        image_processor_dict = json.loads(text)
+        return cls(**image_processor_dict)
+
+    def to_json_string(self) -> str:
+        """
+        Serializes this instance to a JSON string.
+
+        Returns:
+            `str`: String containing all the attributes that make up this image_processor instance in JSON format.
+        """
+        dictionary = self.to_dict()
+
+        for key, value in dictionary.items():
+            if isinstance(value, np.ndarray):
+                dictionary[key] = value.tolist()
+
+        # make sure private name "_processor_class" is correctly
+        # saved as "processor_class"
+        _processor_class = dictionary.pop("_processor_class", None)
+        if _processor_class is not None:
+            dictionary["processor_class"] = _processor_class
+
+        return json.dumps(dictionary, indent=2, sort_keys=True) + "\n"
+
+    def to_json_file(self, json_file_path: Union[str, os.PathLike]):
+        """
+        Save this instance to a JSON file.
+
+        Args:
+            json_file_path (`str` or `os.PathLike`):
+                Path to the JSON file in which this image_processor instance's parameters will be saved.
+        """
+        with open(json_file_path, "w", encoding="utf-8") as writer:
+            writer.write(self.to_json_string())
+
+    def __repr__(self):
+        return f"{self.__class__.__name__} {self.to_json_string()}"
+
+    @classmethod
+    def register_for_auto_class(cls, auto_class="AutoImageProcessor"):
+        """
+        Register this class with a given auto class. This should only be used for custom image processors as the ones
+        in the library are already mapped with `AutoImageProcessor`.
+
+        <Tip warning={true}>
+
+        This API is experimental and may have some slight breaking changes in the next releases.
+
+        </Tip>
+
+        Args:
+            auto_class (`str` or `type`, *optional*, defaults to `"AutoImageProcessor"`):
+                The auto class to register this new image processor with.
+ """ + if not isinstance(auto_class, str): + auto_class = auto_class.__name__ + + import transformers.models.auto as auto_module + + if not hasattr(auto_module, auto_class): + raise ValueError(f"{auto_class} is not a valid auto class.") + + cls._auto_class = auto_class + + +ImageProcessorMixin.push_to_hub = copy_func(ImageProcessorMixin.push_to_hub) +ImageProcessorMixin.push_to_hub.__doc__ = ImageProcessorMixin.push_to_hub.__doc__.format( + object="image processor", object_class="AutoImageProcessor", object_files="image processor file" +) diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index 1ee4521514afb9..530f456a90085a 100644 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -156,6 +156,7 @@ FLAX_WEIGHTS_INDEX_NAME = "flax_model.msgpack.index.json" CONFIG_NAME = "config.json" FEATURE_EXTRACTOR_NAME = "preprocessor_config.json" +IMAGE_PROCESSOR_NAME = "image_processor_config.json" MODEL_CARD_NAME = "modelcard.json" SENTENCEPIECE_UNDERLINE = "▁" From 6f7ef5621e057f4d99ca15872d911931417d9227 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 13:24:56 +0100 Subject: [PATCH 009/162] Fixup and docs --- .../en/internal/image_processing_utils.mdx | 5 +++ src/transformers/__init__.py | 2 ++ src/transformers/image_processing_utils.py | 32 +++++++++---------- .../utils/dummy_vision_objects.py | 7 ++++ 4 files changed, 30 insertions(+), 16 deletions(-) diff --git a/docs/source/en/internal/image_processing_utils.mdx b/docs/source/en/internal/image_processing_utils.mdx index ae4f826517aa1c..4d5831a12fd623 100644 --- a/docs/source/en/internal/image_processing_utils.mdx +++ b/docs/source/en/internal/image_processing_utils.mdx @@ -22,3 +22,8 @@ Most of those are only useful if you are studying the code of the image processo [[autodoc]] image_transforms.to_pil_image [[autodoc]] image_transforms.resize + + +## ImageProcessorMixin + +[[autodoc]] image_processing_utils.ImageProcessorMixin diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index dcacc328e8a8a3..a5c3e0d9057515 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -632,6 +632,7 @@ name for name in dir(dummy_vision_objects) if not name.startswith("_") ] else: + _import_structure["image_processing_utils"] = ["ImageProcessorMixin"] _import_structure["image_transforms"] = ["resize", "to_pil_image"] _import_structure["image_utils"] = ["ImageFeatureExtractionMixin"] _import_structure["models.beit"].append("BeitFeatureExtractor") @@ -3340,6 +3341,7 @@ except OptionalDependencyNotAvailable: from .utils.dummy_vision_objects import * else: + from .image_processing_utils import ImageProcessorMixin from .image_transforms import resize, to_pil_image from .image_utils import ImageFeatureExtractionMixin from .models.beit import BeitFeatureExtractor diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index e053f4f486a788..7e0a0f3a7f9aa8 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -14,29 +14,31 @@ # limitations under the License. 
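Taken together, `to_dict`/`from_dict` and the JSON helpers above form a simple round trip. A hedged sketch, reusing the hypothetical `MyImageProcessor` from the earlier example:

```python
import json

processor = MyImageProcessor(size=256)  # hypothetical subclass from the earlier sketch

as_dict = processor.to_dict()
assert as_dict["image_processor_type"] == "MyImageProcessor"

# from_dict applies matching kwargs as overrides and can hand back the rest.
new_processor, unused = MyImageProcessor.from_dict(
    as_dict, size=512, not_an_attribute=1, return_unused_kwargs=True
)
assert new_processor.size == 512 and unused == {"not_an_attribute": 1}

# to_json_string re-exposes the private "_processor_class" attribute as "processor_class".
payload = json.loads(processor.to_json_string())
assert payload["size"] == 256

processor.to_json_file("preprocessor.json")
restored = MyImageProcessor.from_json_file("preprocessor.json")
```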
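`register_for_auto_class` only records the auto class name on the class. A sketch of the intended call, assuming an `AutoImageProcessor` auto class exists; none is added by the patches shown here, so until one exists this raises the `ValueError` from the `hasattr` check above:

```python
# Assumes transformers.models.auto exposes AutoImageProcessor; otherwise ValueError is raised.
MyImageProcessor.register_for_auto_class()                      # defaults to "AutoImageProcessor"
MyImageProcessor.register_for_auto_class("AutoImageProcessor")  # equivalent, spelled out
```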
import copy -import os import json +import os from typing import Any, Dict, Tuple, Union import numpy as np + from requests import HTTPError from .dynamic_module_utils import custom_object_save from .utils import ( - IMAGE_PROCESSOR_NAME, - PushToHubMixin, - logging, HUGGINGFACE_CO_RESOLVE_ENDPOINT, + IMAGE_PROCESSOR_NAME, EntryNotFoundError, + PushToHubMixin, RepositoryNotFoundError, RevisionNotFoundError, cached_path, copy_func, hf_bucket_url, + is_offline_mode, is_remote_url, - is_offline_mode + logging, ) + logger = logging.get_logger(__name__) @@ -64,12 +66,10 @@ def _set_processor_class(self, processor_class: str): self._processor_class = processor_class @classmethod - def from_pretrained( - cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs - ): + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs): r""" - Instantiate a type of [`~image_processing_utils.ImageProcessorMixin`] from a image processor, *e.g.* a - derived class of [`BaseImageProcessor`]. + Instantiate a type of [`~image_processing_utils.ImageProcessorMixin`] from a image processor, *e.g.* a derived + class of [`BaseImageProcessor`]. Args: pretrained_model_name_or_path (`str` or `os.PathLike`): @@ -87,8 +87,8 @@ def from_pretrained( Path to a directory in which a downloaded pretrained model image processor should be cached if the standard cache should not be used. force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force to (re-)download the image processor files and override the cached versions - if they exist. + Whether or not to force to (re-)download the image processor files and override the cached versions if + they exist. resume_download (`bool`, *optional*, defaults to `False`): Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists. @@ -342,16 +342,16 @@ def to_dict(self) -> Dict[str, Any]: @classmethod def from_json_file(cls, json_file: Union[str, os.PathLike]): """ - Instantiates an image processor of type [`~image_processing_utils.ImageProcessorMixin`] from the path to - a JSON file of parameters. + Instantiates an image processor of type [`~image_processing_utils.ImageProcessorMixin`] from the path to a JSON + file of parameters. Args: json_file (`str` or `os.PathLike`): Path to the JSON file containing the parameters. Returns: - A image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The image_processor - object instantiated from that JSON file. + A image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The image_processor object + instantiated from that JSON file. 
""" with open(json_file, "r", encoding="utf-8") as reader: text = reader.read() diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 0d9da1b7a16d42..5756f67326b7ad 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -3,6 +3,13 @@ from ..utils import DummyObject, requires_backends +class ImageProcessorMixin(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + def resize(*args, **kwargs): requires_backends(resize, ["vision"]) From b9ce4a00399e5d177c1db07bab8ba47595abd12f Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 14:59:34 +0100 Subject: [PATCH 010/162] Import BatchFeature from feature_extraction_utils --- src/transformers/image_processing_utils.py | 173 +++------------------ src/transformers/image_transforms.py | 29 ++-- src/transformers/image_utils.py | 6 +- 3 files changed, 42 insertions(+), 166 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 32477c7f4f777a..0e8b02c56b52d5 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -14,15 +14,16 @@ # limitations under the License. import copy -import os import json -from collections import UserDict -from typing import Any, Dict, Optional, Tuple, Union +import os +from typing import Any, Dict, Tuple, Union import numpy as np + from requests import HTTPError from .dynamic_module_utils import custom_object_save +from .feature_extraction_utils import BatchFeature as BaseBatchFeature from .utils import ( HUGGINGFACE_CO_RESOLVE_ENDPOINT, IMAGE_PROCESSOR_NAME, @@ -30,24 +31,21 @@ PushToHubMixin, RepositoryNotFoundError, RevisionNotFoundError, - TensorType, cached_path, copy_func, hf_bucket_url, - is_flax_available, - is_remote_url, - is_torch_available, - is_tf_available, is_offline_mode, + is_remote_url, logging, - torch_required, ) -from .utils.generic import _is_jax, _is_numpy, _is_torch_device + logger = logging.get_logger(__name__) -class BatchFeature(UserDict): +# TODO: Move BatchFeature to be imported by both feature_extraction_utils and image_processing_utils +# We override the class string here, but logic is the same. +class BatchFeature(BaseBatchFeature): r""" Holds the output of the image processor specific `__call__` methods. @@ -55,144 +53,13 @@ class BatchFeature(UserDict): Args: data (`dict`): - Dictionary of lists/arrays/tensors returned by the __call__/pad methods ('input_values', 'attention_mask', + Dictionary of lists/arrays/tensors returned by the __call__/pad methods ('pixel_values', 'attention_mask', etc.). tensor_type (`Union[None, str, TensorType]`, *optional*): You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at initialization. """ - def __init__(self, data: Optional[Dict[str, Any]] = None, tensor_type: Union[None, str, TensorType] = None): - super().__init__(data) - self.convert_to_tensors(tensor_type=tensor_type) - - # Copied from transformers.tokenization_utils_base.BatchEncoding.__getitem__ - def __getitem__(self, item: str) -> Any: - """ - If the key is a string, returns the value of the dict associated to `key` ('input_values', 'attention_mask', - etc.). 
- """ - if isinstance(item, str): - return self.data[item] - else: - raise KeyError("Indexing with integers is not available when using Python based feature extractors") - - # Copied from transformers.tokenization_utils_base.BatchEncoding.__getattr__ - def __getattr__(self, item: str): - try: - return self.data[item] - except KeyError: - raise AttributeError - - # Copied from transformers.feature_extraction_utils.BatchFeature.__getstate__ - def __getstate__(self): - return {"data": self.data} - - # Copied from transformers.feature_extraction_utils.BatchFeature.__setstate__ - def __setstate__(self, state): - if "data" in state: - self.data = state["data"] - - # Copied from transformers.tokenization_utils_base.BatchEncoding.keys - def keys(self): - return self.data.keys() - - # Copied from transformers.tokenization_utils_base.BatchEncoding.values - def values(self): - return self.data.values() - - # Copied from transformers.tokenization_utils_base.BatchEncoding.items - def items(self): - return self.data.items() - - # Copied from transformers.feature_extraction_utils.BatchFeature.convert_to_tensors - def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None): - """ - Convert the inner content to tensors. - - Args: - tensor_type (`str` or [`~utils.TensorType`], *optional*): - The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`]. If - `None`, no modification is done. - """ - if tensor_type is None: - return self - - # Convert to TensorType - if not isinstance(tensor_type, TensorType): - tensor_type = TensorType(tensor_type) - - # Get a function reference for the correct framework - if tensor_type == TensorType.TENSORFLOW: - if not is_tf_available(): - raise ImportError( - "Unable to convert output to TensorFlow tensors format, TensorFlow is not installed." - ) - import tensorflow as tf - - as_tensor = tf.constant - is_tensor = tf.is_tensor - elif tensor_type == TensorType.PYTORCH: - if not is_torch_available(): - raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.") - import torch - - def as_tensor(value): - if isinstance(value, (list, tuple)) and len(value) > 0 and isinstance(value[0], np.ndarray): - value = np.array(value) - return torch.tensor(value) - - is_tensor = torch.is_tensor - elif tensor_type == TensorType.JAX: - if not is_flax_available(): - raise ImportError("Unable to convert output to JAX tensors format, JAX is not installed.") - import jax.numpy as jnp # noqa: F811 - - as_tensor = jnp.array - is_tensor = _is_jax - else: - as_tensor = np.asarray - is_tensor = _is_numpy - - # Do the tensor conversion in batch - for key, value in self.items(): - try: - if not is_tensor(value): - tensor = as_tensor(value) - - self[key] = tensor - except: # noqa E722 - if key == "overflowing_values": - raise ValueError("Unable to create tensor returning overflowing values of different lengths. ") - raise ValueError( - "Unable to create tensor, you should probably activate padding " - "with 'padding=True' to have batched tensors with the same length." - ) - - return self - - @torch_required - # Copied from transformers.tokenization_utils_base.BatchEncoding.to with BatchEncoding->BatchFeature - def to(self, device: Union[str, "torch.device"]) -> "BatchFeature": - """ - Send all values to device by calling `v.to(device)` (PyTorch only). - - Args: - device (`str` or `torch.device`): The device to put the tensors on. - - Returns: - [`BatchFeature`]: The same instance after modification. 
- """ - - # This check catches things like APEX blindly calling "to" on all inputs to a module - # Otherwise it passes the casts down and casts the LongTensor containing the token idxs - # into a HalfTensor - if isinstance(device, str) or _is_torch_device(device) or isinstance(device, int): - self.data = {k: v.to(device=device) for k, v in self.data.items()} - else: - logger.warning(f"Attempting to cast a BatchFeature to type {str(device)}. This is not supported.") - return self - class ImageProcessorMixin(PushToHubMixin): """ @@ -218,12 +85,10 @@ def _set_processor_class(self, processor_class: str): self._processor_class = processor_class @classmethod - def from_pretrained( - cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs - ): + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs): r""" - Instantiate a type of [`~image_processing_utils.ImageProcessorMixin`] from a image processor, *e.g.* a - derived class of [`BaseImageProcessor`]. + Instantiate a type of [`~image_processing_utils.ImageProcessorMixin`] from a image processor, *e.g.* a derived + class of [`BaseImageProcessor`]. Args: pretrained_model_name_or_path (`str` or `os.PathLike`): @@ -241,8 +106,8 @@ def from_pretrained( Path to a directory in which a downloaded pretrained model image processor should be cached if the standard cache should not be used. force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force to (re-)download the image processor files and override the cached versions - if they exist. + Whether or not to force to (re-)download the image processor files and override the cached versions if + they exist. resume_download (`bool`, *optional*, defaults to `False`): Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists. @@ -496,16 +361,16 @@ def to_dict(self) -> Dict[str, Any]: @classmethod def from_json_file(cls, json_file: Union[str, os.PathLike]): """ - Instantiates an image processor of type [`~image_processing_utils.ImageProcessorMixin`] from the path to - a JSON file of parameters. + Instantiates an image processor of type [`~image_processing_utils.ImageProcessorMixin`] from the path to a JSON + file of parameters. Args: json_file (`str` or `os.PathLike`): Path to the JSON file containing the parameters. Returns: - A image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The image_processor - object instantiated from that JSON file. + A image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The image_processor object + instantiated from that JSON file. """ with open(json_file, "r", encoding="utf-8") as reader: text = reader.read() diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 7585aff7643265..7551a431b4ba79 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -13,32 +13,43 @@ # See the License for the specific language governing permissions and # limitations under the License. 
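The large deletion above is the point of this commit: `BatchFeature` keeps its dict-like access and tensor conversion, but now inherits them from `feature_extraction_utils.BatchFeature` instead of re-implementing them. A rough usage sketch, with NumPy as the target tensor type so no extra framework is required:

```python
import numpy as np
from transformers.image_processing_utils import BatchFeature

batch = BatchFeature({"pixel_values": [np.zeros((3, 224, 224))]}, tensor_type="np")
batch["pixel_values"]   # dict-style access
batch.pixel_values      # attribute-style access via the inherited __getattr__
list(batch.keys())      # ["pixel_values"]
```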
-from typing import Optional, Tuple, List, Union +from typing import TYPE_CHECKING, List, Optional, Tuple, Union -import PIL import numpy as np +import PIL + +from transformers.utils.import_utils import is_flax_available, is_tf_available, is_torch_available from .image_utils import ( ChannelDimension, get_image_size, infer_channel_dimension, - is_torch_tensor, + is_jax_tensor, is_tf_tensor, - is_jax_tensor + is_torch_tensor, ) +if TYPE_CHECKING: + if is_torch_available(): + import torch + if is_tf_available(): + import tensorflow as tf + if is_flax_available(): + import jax.numpy as jnp + + def to_pil_image( - image: Union[np.ndarray, PIL.Image.Image, "torch.Tensor", "tf.Tensor"], + image: Union[np.ndarray, PIL.Image.Image, "torch.Tensor", "tf.Tensor", "jnp.ndarray"], channel_dim: Optional[ChannelDimension] = None, - rescale=None + rescale=None, ) -> PIL.Image.Image: """ Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if needed. Args: - image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor`): + image (`PIL.Image.Image`, `numpy.ndarray`, `torch.Tensor`, `tf.Tensor`): The image to convert to the PIL Image format. rescale (`bool`, *optional*): Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will default @@ -53,7 +64,7 @@ def to_pil_image( image = np.array(image) if not isinstance(image, np.ndarray): - raise ValueError("Input image must be of type PIL.Image.Image, numpy.ndarray or torch.Tensor") + raise ValueError("Input image type not supported: {}".format(type(image))) # If the channel as been moved to first dim, we put it back at the end. channel_dim = infer_channel_dimension(image) if channel_dim is None else channel_dim @@ -72,7 +83,7 @@ def get_resize_output_image_size( input_image: np.ndarray, size: Union[int, Tuple[int, int], List[int]], default_to_square: bool = True, - max_size: int = None + max_size: int = None, ) -> np.ndarray: if isinstance(size, (tuple, list)): if len(size) == 2: diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index e23321be478040..15bcf99542614c 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -23,8 +23,8 @@ import requests -from .utils import is_torch_available, is_tf_available, is_jax_available -from .utils.generic import _is_torch, _is_tensorflow, _is_jax +from .utils import is_flax_available, is_tf_available, is_torch_available +from .utils.generic import _is_jax, _is_tensorflow, _is_torch IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406] @@ -46,7 +46,7 @@ def is_tf_tensor(obj): def is_jax_tensor(obj): - return _is_jax(obj) if is_jax_available() else False + return _is_jax(obj) if is_flax_available() else False class ChannelDimension(enum.Enum): From 6b678fb4397a5dc2ddffcf93271dd7c534a964ca Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 13:18:18 +0100 Subject: [PATCH 011/162] Fixup and docs --- .../en/internal/image_processing_utils.mdx | 24 +++++++++ src/transformers/image_transforms.py | 50 ++++++++++++++++--- src/transformers/image_utils.py | 6 +-- 3 files changed, 70 insertions(+), 10 deletions(-) create mode 100644 docs/source/en/internal/image_processing_utils.mdx diff --git a/docs/source/en/internal/image_processing_utils.mdx b/docs/source/en/internal/image_processing_utils.mdx new file mode 100644 index 00000000000000..ae4f826517aa1c --- /dev/null +++ b/docs/source/en/internal/image_processing_utils.mdx @@ -0,0 +1,24 @@ + + +# Utilities for Image Processors 
+ +This page lists all the utility functions used by the image processors, mainly the functional +transformations used to process the images. + +Most of those are only useful if you are studying the code of the image processors in the library. + +## Image Transformations + +[[autodoc]] image_transforms.to_pil_image + +[[autodoc]] image_transforms.resize diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 7585aff7643265..9112506e8ebffa 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -13,25 +13,34 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional, Tuple, List, Union +from typing import TYPE_CHECKING, List, Optional, Tuple, Union -import PIL import numpy as np +import PIL + +from transformers.utils.import_utils import is_tf_available, is_torch_available from .image_utils import ( ChannelDimension, get_image_size, infer_channel_dimension, - is_torch_tensor, + is_jax_tensor, is_tf_tensor, - is_jax_tensor + is_torch_tensor, ) +if TYPE_CHECKING: + if is_torch_available(): + import torch + if is_tf_available(): + import tensorflow as tf + + def to_pil_image( image: Union[np.ndarray, PIL.Image.Image, "torch.Tensor", "tf.Tensor"], channel_dim: Optional[ChannelDimension] = None, - rescale=None + rescale=None, ) -> PIL.Image.Image: """ Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if @@ -70,10 +79,37 @@ def to_pil_image( def get_resize_output_image_size( input_image: np.ndarray, - size: Union[int, Tuple[int, int], List[int]], + size: Union[int, Tuple[int, int], List[int], Tuple[int]], default_to_square: bool = True, - max_size: int = None + max_size: int = None, ) -> np.ndarray: + """ + Find the target (height, width) dimension of the output image after resizing given the input image and the desired + size. + + Args: + input_image (`np.ndarray`): + The image to resize. + size (`int` or `Tuple[int, int]` or List[int] or Tuple[int]): + The size to use for resizing the image. If `size` is a sequence like (h, w), output size will be matched to + this. + + If `size` is an int and `default_to_square` is `True`, then image will be resized to (size, size). If + `size` is an int and `default_to_square` is `False`, then smaller edge of the image will be matched to this + number. i.e, if height > width, then image will be rescaled to (size * height / width, size). + resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`): + The filter to user for resampling. + default_to_square (`bool`, *optional*, defaults to `True`): + How to convert `size` when it is a single int. If set to `True`, the `size` will be converted to a square + (`size`,`size`). If set to `False`, will replicate + [`torchvision.transforms.Resize`](https://pytorch.org/vision/stable/transforms.html#torchvision.transforms.Resize) + with support for resizing only the smallest edge and providing an optional `max_size`. + max_size (`int`, *optional*, defaults to `None`): + The maximum allowed for the longer edge of the resized image: if the longer edge of the image is greater + than `max_size` after being resized according to `size`, then the image is resized again so that the longer + edge is equal to `max_size`. As a result, `size` might be overruled, i.e the smaller edge may be shorter + than `size`. Only used if `default_to_square` is `False`. 
+ """ if isinstance(size, (tuple, list)): if len(size) == 2: return size diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index e23321be478040..15bcf99542614c 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -23,8 +23,8 @@ import requests -from .utils import is_torch_available, is_tf_available, is_jax_available -from .utils.generic import _is_torch, _is_tensorflow, _is_jax +from .utils import is_flax_available, is_tf_available, is_torch_available +from .utils.generic import _is_jax, _is_tensorflow, _is_torch IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406] @@ -46,7 +46,7 @@ def is_tf_tensor(obj): def is_jax_tensor(obj): - return _is_jax(obj) if is_jax_available() else False + return _is_jax(obj) if is_flax_available() else False class ChannelDimension(enum.Enum): From db9343777571f0ee77307646755cab377ea6d388 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 13:24:56 +0100 Subject: [PATCH 012/162] Fixup and docs --- .../en/internal/image_processing_utils.mdx | 5 +++ src/transformers/__init__.py | 2 ++ src/transformers/image_processing_utils.py | 32 +++++++++---------- .../utils/dummy_vision_objects.py | 7 ++++ 4 files changed, 30 insertions(+), 16 deletions(-) diff --git a/docs/source/en/internal/image_processing_utils.mdx b/docs/source/en/internal/image_processing_utils.mdx index ae4f826517aa1c..4d5831a12fd623 100644 --- a/docs/source/en/internal/image_processing_utils.mdx +++ b/docs/source/en/internal/image_processing_utils.mdx @@ -22,3 +22,8 @@ Most of those are only useful if you are studying the code of the image processo [[autodoc]] image_transforms.to_pil_image [[autodoc]] image_transforms.resize + + +## ImageProcessorMixin + +[[autodoc]] image_processing_utils.ImageProcessorMixin diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index dcacc328e8a8a3..a5c3e0d9057515 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -632,6 +632,7 @@ name for name in dir(dummy_vision_objects) if not name.startswith("_") ] else: + _import_structure["image_processing_utils"] = ["ImageProcessorMixin"] _import_structure["image_transforms"] = ["resize", "to_pil_image"] _import_structure["image_utils"] = ["ImageFeatureExtractionMixin"] _import_structure["models.beit"].append("BeitFeatureExtractor") @@ -3340,6 +3341,7 @@ except OptionalDependencyNotAvailable: from .utils.dummy_vision_objects import * else: + from .image_processing_utils import ImageProcessorMixin from .image_transforms import resize, to_pil_image from .image_utils import ImageFeatureExtractionMixin from .models.beit import BeitFeatureExtractor diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 908216cd463435..0ec4a429e1a552 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -14,29 +14,31 @@ # limitations under the License. 
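A short worked sketch of the sizing rules documented above; only the unambiguous cases are shown, and the input array is a stand-in:

```python
import numpy as np
from transformers.image_transforms import get_resize_output_image_size

image = np.zeros((3, 480, 640))  # dummy channels-first array

get_resize_output_image_size(image, size=(256, 384))  # an explicit pair is returned as-is
get_resize_output_image_size(image, size=224)         # a bare int defaults to a square: (224, 224)
# With default_to_square=False, the short edge is matched to `size`, the long edge is
# scaled to keep the aspect ratio, and `max_size` (if given) caps the long edge.
```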
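The `TYPE_CHECKING` guard these commits add to `image_transforms.py` is the usual trick for annotating with optional heavy dependencies without importing them at runtime; a minimal sketch of the same pattern:

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Evaluated only by static type checkers, never at runtime, so torch stays optional.
    import torch

def count_elements(tensor: "torch.Tensor") -> int:
    # The annotation is a string, so defining or importing this never requires torch.
    return tensor.numel()
```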
import copy -import os import json +import os from typing import Any, Dict, Tuple, Union import numpy as np + from requests import HTTPError from .dynamic_module_utils import custom_object_save from .utils import ( - IMAGE_PROCESSOR_NAME, - PushToHubMixin, - logging, HUGGINGFACE_CO_RESOLVE_ENDPOINT, + IMAGE_PROCESSOR_NAME, EntryNotFoundError, + PushToHubMixin, RepositoryNotFoundError, RevisionNotFoundError, cached_path, copy_func, hf_bucket_url, + is_offline_mode, is_remote_url, - is_offline_mode + logging, ) + logger = logging.get_logger(__name__) @@ -64,12 +66,10 @@ def _set_processor_class(self, processor_class: str): self._processor_class = processor_class @classmethod - def from_pretrained( - cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs - ): + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs): r""" - Instantiate a type of [`~image_processing_utils.ImageProcessorMixin`] from a image processor, *e.g.* a - derived class of [`BaseImageProcessor`]. + Instantiate a type of [`~image_processing_utils.ImageProcessorMixin`] from a image processor, *e.g.* a derived + class of [`BaseImageProcessor`]. Args: pretrained_model_name_or_path (`str` or `os.PathLike`): @@ -87,8 +87,8 @@ def from_pretrained( Path to a directory in which a downloaded pretrained model image processor should be cached if the standard cache should not be used. force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force to (re-)download the image processor files and override the cached versions - if they exist. + Whether or not to force to (re-)download the image processor files and override the cached versions if + they exist. resume_download (`bool`, *optional*, defaults to `False`): Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists. @@ -342,16 +342,16 @@ def to_dict(self) -> Dict[str, Any]: @classmethod def from_json_file(cls, json_file: Union[str, os.PathLike]): """ - Instantiates an image processor of type [`~image_processing_utils.ImageProcessorMixin`] from the path to - a JSON file of parameters. + Instantiates an image processor of type [`~image_processing_utils.ImageProcessorMixin`] from the path to a JSON + file of parameters. Args: json_file (`str` or `os.PathLike`): Path to the JSON file containing the parameters. Returns: - A image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The image_processor - object instantiated from that JSON file. + A image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The image_processor object + instantiated from that JSON file. 
""" with open(json_file, "r", encoding="utf-8") as reader: text = reader.read() diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 0d9da1b7a16d42..5756f67326b7ad 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -3,6 +3,13 @@ from ..utils import DummyObject, requires_backends +class ImageProcessorMixin(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + def resize(*args, **kwargs): requires_backends(resize, ["vision"]) From bd890d599d3929403c68cd65164200e695bf56e9 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 13:18:18 +0100 Subject: [PATCH 013/162] Fixup and docs --- .../en/internal/image_processing_utils.mdx | 24 +++++++++++++++ src/transformers/image_transforms.py | 29 ++++++++++++++++++- 2 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 docs/source/en/internal/image_processing_utils.mdx diff --git a/docs/source/en/internal/image_processing_utils.mdx b/docs/source/en/internal/image_processing_utils.mdx new file mode 100644 index 00000000000000..ae4f826517aa1c --- /dev/null +++ b/docs/source/en/internal/image_processing_utils.mdx @@ -0,0 +1,24 @@ + + +# Utilities for Image Processors + +This page lists all the utility functions used by the image processors, mainly the functional +transformations used to process the images. + +Most of those are only useful if you are studying the code of the image processors in the library. + +## Image Transformations + +[[autodoc]] image_transforms.to_pil_image + +[[autodoc]] image_transforms.resize diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 7551a431b4ba79..38640029eaa0d9 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -81,10 +81,37 @@ def to_pil_image( def get_resize_output_image_size( input_image: np.ndarray, - size: Union[int, Tuple[int, int], List[int]], + size: Union[int, Tuple[int, int], List[int], Tuple[int]], default_to_square: bool = True, max_size: int = None, ) -> np.ndarray: + """ + Find the target (height, width) dimension of the output image after resizing given the input image and the desired + size. + + Args: + input_image (`np.ndarray`): + The image to resize. + size (`int` or `Tuple[int, int]` or List[int] or Tuple[int]): + The size to use for resizing the image. If `size` is a sequence like (h, w), output size will be matched to + this. + + If `size` is an int and `default_to_square` is `True`, then image will be resized to (size, size). If + `size` is an int and `default_to_square` is `False`, then smaller edge of the image will be matched to this + number. i.e, if height > width, then image will be rescaled to (size * height / width, size). + resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`): + The filter to user for resampling. + default_to_square (`bool`, *optional*, defaults to `True`): + How to convert `size` when it is a single int. If set to `True`, the `size` will be converted to a square + (`size`,`size`). If set to `False`, will replicate + [`torchvision.transforms.Resize`](https://pytorch.org/vision/stable/transforms.html#torchvision.transforms.Resize) + with support for resizing only the smallest edge and providing an optional `max_size`. 
+ max_size (`int`, *optional*, defaults to `None`): + The maximum allowed for the longer edge of the resized image: if the longer edge of the image is greater + than `max_size` after being resized according to `size`, then the image is resized again so that the longer + edge is equal to `max_size`. As a result, `size` might be overruled, i.e the smaller edge may be shorter + than `size`. Only used if `default_to_square` is `False`. + """ if isinstance(size, (tuple, list)): if len(size) == 2: return size From 4b27a340e451b0a402e78886f965c1c617f1fcf4 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 13:24:56 +0100 Subject: [PATCH 014/162] Fixup and docs --- docs/source/en/internal/image_processing_utils.mdx | 5 +++++ src/transformers/__init__.py | 2 ++ src/transformers/utils/dummy_vision_objects.py | 7 +++++++ 3 files changed, 14 insertions(+) diff --git a/docs/source/en/internal/image_processing_utils.mdx b/docs/source/en/internal/image_processing_utils.mdx index ae4f826517aa1c..4d5831a12fd623 100644 --- a/docs/source/en/internal/image_processing_utils.mdx +++ b/docs/source/en/internal/image_processing_utils.mdx @@ -22,3 +22,8 @@ Most of those are only useful if you are studying the code of the image processo [[autodoc]] image_transforms.to_pil_image [[autodoc]] image_transforms.resize + + +## ImageProcessorMixin + +[[autodoc]] image_processing_utils.ImageProcessorMixin diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index dcacc328e8a8a3..a5c3e0d9057515 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -632,6 +632,7 @@ name for name in dir(dummy_vision_objects) if not name.startswith("_") ] else: + _import_structure["image_processing_utils"] = ["ImageProcessorMixin"] _import_structure["image_transforms"] = ["resize", "to_pil_image"] _import_structure["image_utils"] = ["ImageFeatureExtractionMixin"] _import_structure["models.beit"].append("BeitFeatureExtractor") @@ -3340,6 +3341,7 @@ except OptionalDependencyNotAvailable: from .utils.dummy_vision_objects import * else: + from .image_processing_utils import ImageProcessorMixin from .image_transforms import resize, to_pil_image from .image_utils import ImageFeatureExtractionMixin from .models.beit import BeitFeatureExtractor diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 0d9da1b7a16d42..5756f67326b7ad 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -3,6 +3,13 @@ from ..utils import DummyObject, requires_backends +class ImageProcessorMixin(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + def resize(*args, **kwargs): requires_backends(resize, ["vision"]) From ff0d49ec10c4d7ecda79248567a88a4285efbbd3 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Wed, 27 Jul 2022 17:30:59 +0100 Subject: [PATCH 015/162] BatchFeature for packaging image processor outputs --- src/transformers/image_processing_utils.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 0e8b02c56b52d5..5370b9d35cad77 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -15,8 +15,8 @@ import copy import json -import os -from typing import Any, Dict, Tuple, Union +from collections import UserDict +from typing import Any, Dict, 
Optional, Tuple, Union import numpy as np @@ -31,13 +31,19 @@ PushToHubMixin, RepositoryNotFoundError, RevisionNotFoundError, + TensorType, cached_path, copy_func, hf_bucket_url, - is_offline_mode, + is_flax_available, is_remote_url, + is_torch_available, + is_tf_available, + is_offline_mode, logging, + torch_required, ) +from .utils.generic import _is_jax, _is_numpy, _is_torch_device logger = logging.get_logger(__name__) From 2c2fa9aa63f734f67fde6876fc1db5a961020e08 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 14:59:34 +0100 Subject: [PATCH 016/162] Import BatchFeature from feature_extraction_utils --- src/transformers/image_processing_utils.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 5370b9d35cad77..9aa4bebf89bf6e 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -15,8 +15,8 @@ import copy import json -from collections import UserDict -from typing import Any, Dict, Optional, Tuple, Union +import os +from typing import Any, Dict, Tuple, Union import numpy as np @@ -31,19 +31,14 @@ PushToHubMixin, RepositoryNotFoundError, RevisionNotFoundError, - TensorType, cached_path, copy_func, hf_bucket_url, - is_flax_available, - is_remote_url, - is_torch_available, - is_tf_available, is_offline_mode, + is_remote_url, logging, - torch_required, ) -from .utils.generic import _is_jax, _is_numpy, _is_torch_device + logger = logging.get_logger(__name__) From 7faf2e69167453d8c095d62790b3cd79b7ec3987 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 14:59:34 +0100 Subject: [PATCH 017/162] Import BatchFeature from feature_extraction_utils --- src/transformers/image_processing_utils.py | 173 +++------------------ src/transformers/image_transforms.py | 41 +++-- src/transformers/image_utils.py | 6 +- 3 files changed, 40 insertions(+), 180 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 2f1377b3773f0f..872ce352c3de1c 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -14,16 +14,16 @@ # limitations under the License. import copy -import os import json -from collections import UserDict -from typing import Any, Dict, Optional, Tuple, Union +import os +from typing import Any, Dict, Tuple, Union import numpy as np + from requests import HTTPError from .dynamic_module_utils import custom_object_save -from .image_utils import ImageType +from .feature_extraction_utils import BatchFeature as BaseBatchFeature from .utils import ( HUGGINGFACE_CO_RESOLVE_ENDPOINT, IMAGE_PROCESSOR_NAME, @@ -31,24 +31,21 @@ PushToHubMixin, RepositoryNotFoundError, RevisionNotFoundError, - TensorType, cached_path, copy_func, hf_bucket_url, - is_flax_available, - is_remote_url, - is_torch_available, - is_tf_available, is_offline_mode, + is_remote_url, logging, - torch_required, ) -from .utils.generic import _is_jax, _is_numpy, _is_torch_device + logger = logging.get_logger(__name__) -class BatchFeature(UserDict): +# TODO: Move BatchFeature to be imported by both feature_extraction_utils and image_processing_utils +# We override the class string here, but logic is the same. +class BatchFeature(BaseBatchFeature): r""" Holds the output of the image processor specific `__call__` methods. 
@@ -56,143 +53,13 @@ class BatchFeature(UserDict): Args: data (`dict`): - Dictionary of lists/arrays/tensors returned by the __call__/pad methods ('input_values', 'attention_mask', + Dictionary of lists/arrays/tensors returned by the __call__/pad methods ('pixel_values', 'attention_mask', etc.). tensor_type (`Union[None, str, TensorType]`, *optional*): You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at initialization. """ - def __init__(self, data: Optional[Dict[str, Any]] = None, tensor_type: Union[None, str, TensorType] = None): - super().__init__(data) - self.convert_to_tensors(tensor_type=tensor_type) - - # Copied from transformers.tokenization_utils_base.BatchEncoding.__getitem__ - def __getitem__(self, item: str) -> Any: - """ - If the key is a string, returns the value of the dict associated to `key` ('input_values', 'attention_mask', - etc.). - """ - if isinstance(item, str): - return self.data[item] - else: - raise KeyError("Indexing with integers is not available when using Python based feature extractors") - - # Copied from transformers.tokenization_utils_base.BatchEncoding.__getattr__ - def __getattr__(self, item: str): - try: - return self.data[item] - except KeyError: - raise AttributeError - - # Copied from transformers.feature_extraction_utils.BatchFeature.__getstate__ - def __getstate__(self): - return {"data": self.data} - - # Copied from transformers.feature_extraction_utils.BatchFeature.__setstate__ - def __setstate__(self, state): - if "data" in state: - self.data = state["data"] - - # Copied from transformers.tokenization_utils_base.BatchEncoding.keys - def keys(self): - return self.data.keys() - - # Copied from transformers.tokenization_utils_base.BatchEncoding.values - def values(self): - return self.data.values() - - # Copied from transformers.tokenization_utils_base.BatchEncoding.items - def items(self): - return self.data.items() - - def convert_to_tensors(self, tensor_type: Optional[Union[str, ImageType]] = None): - """ - Convert the inner content to tensors. - - Args: - tensor_type (`str` or [`~utils.ImageType`], *optional*): - The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.ImageType`]. If - `None`, no modification is done. - """ - if tensor_type is None: - return self - - # Convert to TensorType - if not isinstance(tensor_type, ImageType): - tensor_type = ImageType(tensor_type) - - # Get a function reference for the correct framework - if tensor_type == ImageType.TENSORFLOW: - if not is_tf_available(): - raise ImportError( - "Unable to convert output to TensorFlow tensors format, TensorFlow is not installed." 
- ) - import tensorflow as tf - - as_tensor = tf.constant - is_tensor = tf.is_tensor - elif tensor_type == ImageType.PYTORCH: - if not is_torch_available(): - raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.") - import torch - - def as_tensor(value): - if isinstance(value, (list, tuple)) and len(value) > 0 and isinstance(value[0], np.ndarray): - value = np.array(value) - return torch.tensor(value) - - is_tensor = torch.is_tensor - elif tensor_type == ImageType.JAX: - if not is_flax_available(): - raise ImportError("Unable to convert output to JAX tensors format, JAX is not installed.") - import jax.numpy as jnp # noqa: F811 - - as_tensor = jnp.array - is_tensor = _is_jax - else: - as_tensor = np.asarray - is_tensor = _is_numpy - - # Do the tensor conversion in batch - for key, value in self.items(): - try: - if not is_tensor(value): - tensor = as_tensor(value) - - self[key] = tensor - except: # noqa E722 - if key == "overflowing_values": - raise ValueError("Unable to create tensor returning overflowing values of different lengths. ") - raise ValueError( - "Unable to create tensor, you should probably activate padding " - "with 'padding=True' to have batched tensors with the same length." - ) - - return self - - @torch_required - # Copied from transformers.tokenization_utils_base.BatchEncoding.to with BatchEncoding->BatchFeature - def to(self, device: Union[str, "torch.device"]) -> "BatchFeature": - """ - Send all values to device by calling `v.to(device)` (PyTorch only). - - Args: - device (`str` or `torch.device`): The device to put the tensors on. - - Returns: - [`BatchFeature`]: The same instance after modification. - """ - - # This check catches things like APEX blindly calling "to" on all inputs to a module - # Otherwise it passes the casts down and casts the LongTensor containing the token idxs - # into a HalfTensor - if isinstance(device, str) or _is_torch_device(device) or isinstance(device, int): - self.data = {k: v.to(device=device) for k, v in self.data.items()} - else: - logger.warning(f"Attempting to cast a BatchFeature to type {str(device)}. This is not supported.") - return self - class ImageProcessorMixin(PushToHubMixin): """ @@ -218,12 +85,10 @@ def _set_processor_class(self, processor_class: str): self._processor_class = processor_class @classmethod - def from_pretrained( - cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs - ): + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs): r""" - Instantiate a type of [`~image_processing_utils.ImageProcessorMixin`] from a image processor, *e.g.* a - derived class of [`BaseImageProcessor`]. + Instantiate a type of [`~image_processing_utils.ImageProcessorMixin`] from a image processor, *e.g.* a derived + class of [`BaseImageProcessor`]. Args: pretrained_model_name_or_path (`str` or `os.PathLike`): @@ -241,8 +106,8 @@ def from_pretrained( Path to a directory in which a downloaded pretrained model image processor should be cached if the standard cache should not be used. force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force to (re-)download the image processor files and override the cached versions - if they exist. + Whether or not to force to (re-)download the image processor files and override the cached versions if + they exist. resume_download (`bool`, *optional*, defaults to `False`): Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists. 
@@ -496,16 +361,16 @@ def to_dict(self) -> Dict[str, Any]: @classmethod def from_json_file(cls, json_file: Union[str, os.PathLike]): """ - Instantiates an image processor of type [`~image_processing_utils.ImageProcessorMixin`] from the path to - a JSON file of parameters. + Instantiates an image processor of type [`~image_processing_utils.ImageProcessorMixin`] from the path to a JSON + file of parameters. Args: json_file (`str` or `os.PathLike`): Path to the JSON file containing the parameters. Returns: - A image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The image_processor - object instantiated from that JSON file. + A image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The image_processor object + instantiated from that JSON file. """ with open(json_file, "r", encoding="utf-8") as reader: text = reader.read() diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 55d788d4753175..e7e99c09f8500c 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -13,48 +13,43 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional, Tuple, List, Union +from typing import TYPE_CHECKING, List, Optional, Tuple, Union -import PIL import numpy as np +import PIL + +from transformers.utils.import_utils import is_flax_available, is_tf_available, is_torch_available from .image_utils import ( ChannelDimension, get_image_size, infer_channel_dimension, - is_torch_tensor, + is_jax_tensor, is_tf_tensor, - is_jax_tensor + is_torch_tensor, ) -def rescale(image: np.ndarray, scale: Union[float, int] = 255) -> np.ndarray: - """ - Rescales `image` by `scale`. - - Args: - image (`np.ndarray``): - The image to rescale. - scale (`float`, `int`): - The scale to use for rescaling the image. - - Returns: - image: A rescaled np.ndarray image. - """ - return image * scale +if TYPE_CHECKING: + if is_torch_available(): + import torch + if is_tf_available(): + import tensorflow as tf + if is_flax_available(): + import jax.numpy as jnp def to_pil_image( - image: Union[np.ndarray, PIL.Image.Image, "torch.Tensor", "tf.Tensor"], + image: Union[np.ndarray, PIL.Image.Image, "torch.Tensor", "tf.Tensor", "jnp.ndarray"], channel_dim: Optional[ChannelDimension] = None, - do_rescale: Optional[bool] = None + rescale=None, ) -> PIL.Image.Image: """ Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if needed. Args: - image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor`): + image (`PIL.Image.Image`, `numpy.ndarray`, `torch.Tensor`, `tf.Tensor`): The image to convert to the PIL Image format. rescale (`bool`, *optional*): Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will default @@ -69,7 +64,7 @@ def to_pil_image( image = np.array(image) if not isinstance(image, np.ndarray): - raise ValueError("Input image must be of type PIL.Image.Image, numpy.ndarray or torch.Tensor") + raise ValueError("Input image type not supported: {}".format(type(image))) # If the channel as been moved to first dim, we put it back at the end. 
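A quick sketch of the `to_pil_image` path shown above for a channels-first float array; per the docstring, float inputs are meant to be brought into the 0-255 range before the PIL conversion:

```python
import numpy as np
from transformers.image_transforms import to_pil_image

array = np.random.rand(3, 64, 64)  # float values in [0, 1], channels first
pil_image = to_pil_image(array)    # channel dimension is inferred and moved last
print(pil_image.size)              # (64, 64)
```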
channel_dim = infer_channel_dimension(image) if channel_dim is None else channel_dim @@ -88,7 +83,7 @@ def get_resize_output_image_size( input_image: np.ndarray, size: Union[int, Tuple[int, int], List[int]], default_to_square: bool = True, - max_size: int = None + max_size: int = None, ) -> np.ndarray: if isinstance(size, (tuple, list)): if len(size) == 2: diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index e5afb3122bcd03..8fdf7aadac4dec 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -23,8 +23,8 @@ import requests -from .utils import TensorType, is_torch_available, is_tf_available, is_jax_available -from .utils.generic import ExplicitEnum, _is_torch, _is_tensorflow, _is_jax, _is_numpy, to_numpy +from .utils import is_flax_available, is_tf_available, is_torch_available +from .utils.generic import _is_jax, _is_tensorflow, _is_torch IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406] @@ -61,7 +61,7 @@ def is_tf_tensor(obj): def is_jax_tensor(obj): - return _is_jax(obj) if is_jax_available() else False + return _is_jax(obj) if is_flax_available() else False def is_valid_image(img): From ccc15fb6887f748b98c67a3ad4521cfb96a0ddf7 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 13:18:18 +0100 Subject: [PATCH 018/162] Fixup and docs --- .../en/internal/image_processing_utils.mdx | 24 +++++++++++++++ src/transformers/image_transforms.py | 29 ++++++++++++++++++- 2 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 docs/source/en/internal/image_processing_utils.mdx diff --git a/docs/source/en/internal/image_processing_utils.mdx b/docs/source/en/internal/image_processing_utils.mdx new file mode 100644 index 00000000000000..ae4f826517aa1c --- /dev/null +++ b/docs/source/en/internal/image_processing_utils.mdx @@ -0,0 +1,24 @@ + + +# Utilities for Image Processors + +This page lists all the utility functions used by the image processors, mainly the functional +transformations used to process the images. + +Most of those are only useful if you are studying the code of the image processors in the library. + +## Image Transformations + +[[autodoc]] image_transforms.to_pil_image + +[[autodoc]] image_transforms.resize diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index e7e99c09f8500c..c7e9c0ec9e20f1 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -81,10 +81,37 @@ def to_pil_image( def get_resize_output_image_size( input_image: np.ndarray, - size: Union[int, Tuple[int, int], List[int]], + size: Union[int, Tuple[int, int], List[int], Tuple[int]], default_to_square: bool = True, max_size: int = None, ) -> np.ndarray: + """ + Find the target (height, width) dimension of the output image after resizing given the input image and the desired + size. + + Args: + input_image (`np.ndarray`): + The image to resize. + size (`int` or `Tuple[int, int]` or List[int] or Tuple[int]): + The size to use for resizing the image. If `size` is a sequence like (h, w), output size will be matched to + this. + + If `size` is an int and `default_to_square` is `True`, then image will be resized to (size, size). If + `size` is an int and `default_to_square` is `False`, then smaller edge of the image will be matched to this + number. i.e, if height > width, then image will be rescaled to (size * height / width, size). + resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`): + The filter to user for resampling. 
+ default_to_square (`bool`, *optional*, defaults to `True`): + How to convert `size` when it is a single int. If set to `True`, the `size` will be converted to a square + (`size`,`size`). If set to `False`, will replicate + [`torchvision.transforms.Resize`](https://pytorch.org/vision/stable/transforms.html#torchvision.transforms.Resize) + with support for resizing only the smallest edge and providing an optional `max_size`. + max_size (`int`, *optional*, defaults to `None`): + The maximum allowed for the longer edge of the resized image: if the longer edge of the image is greater + than `max_size` after being resized according to `size`, then the image is resized again so that the longer + edge is equal to `max_size`. As a result, `size` might be overruled, i.e the smaller edge may be shorter + than `size`. Only used if `default_to_square` is `False`. + """ if isinstance(size, (tuple, list)): if len(size) == 2: return size From c8f8eb6e0c918ea2f82b0f0faba141489a508c42 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 13:24:56 +0100 Subject: [PATCH 019/162] Fixup and docs --- docs/source/en/internal/image_processing_utils.mdx | 5 +++++ src/transformers/__init__.py | 2 ++ src/transformers/utils/dummy_vision_objects.py | 7 +++++++ 3 files changed, 14 insertions(+) diff --git a/docs/source/en/internal/image_processing_utils.mdx b/docs/source/en/internal/image_processing_utils.mdx index ae4f826517aa1c..4d5831a12fd623 100644 --- a/docs/source/en/internal/image_processing_utils.mdx +++ b/docs/source/en/internal/image_processing_utils.mdx @@ -22,3 +22,8 @@ Most of those are only useful if you are studying the code of the image processo [[autodoc]] image_transforms.to_pil_image [[autodoc]] image_transforms.resize + + +## ImageProcessorMixin + +[[autodoc]] image_processing_utils.ImageProcessorMixin diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index dcacc328e8a8a3..a5c3e0d9057515 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -632,6 +632,7 @@ name for name in dir(dummy_vision_objects) if not name.startswith("_") ] else: + _import_structure["image_processing_utils"] = ["ImageProcessorMixin"] _import_structure["image_transforms"] = ["resize", "to_pil_image"] _import_structure["image_utils"] = ["ImageFeatureExtractionMixin"] _import_structure["models.beit"].append("BeitFeatureExtractor") @@ -3340,6 +3341,7 @@ except OptionalDependencyNotAvailable: from .utils.dummy_vision_objects import * else: + from .image_processing_utils import ImageProcessorMixin from .image_transforms import resize, to_pil_image from .image_utils import ImageFeatureExtractionMixin from .models.beit import BeitFeatureExtractor diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 0d9da1b7a16d42..5756f67326b7ad 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -3,6 +3,13 @@ from ..utils import DummyObject, requires_backends +class ImageProcessorMixin(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + def resize(*args, **kwargs): requires_backends(resize, ["vision"]) From 90093f4b989869cc4191c26ba27cd428af521cae Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Wed, 27 Jul 2022 17:30:59 +0100 Subject: [PATCH 020/162] BatchFeature for packaging image processor outputs --- src/transformers/image_processing_utils.py | 12 +++++++++--- 1 file changed, 9 
insertions(+), 3 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 872ce352c3de1c..587753856420b8 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -15,8 +15,8 @@ import copy import json -import os -from typing import Any, Dict, Tuple, Union +from collections import UserDict +from typing import Any, Dict, Optional, Tuple, Union import numpy as np @@ -31,13 +31,19 @@ PushToHubMixin, RepositoryNotFoundError, RevisionNotFoundError, + TensorType, cached_path, copy_func, hf_bucket_url, - is_offline_mode, + is_flax_available, is_remote_url, + is_torch_available, + is_tf_available, + is_offline_mode, logging, + torch_required, ) +from .utils.generic import _is_jax, _is_numpy, _is_torch_device logger = logging.get_logger(__name__) From d89c0513ba7e52c1de91870c6179d392a0d7114b Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 14:59:34 +0100 Subject: [PATCH 021/162] Import BatchFeature from feature_extraction_utils --- src/transformers/image_processing_utils.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 587753856420b8..38778039aee2a3 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -15,8 +15,8 @@ import copy import json -from collections import UserDict -from typing import Any, Dict, Optional, Tuple, Union +import os +from typing import Any, Dict, Tuple, Union import numpy as np @@ -31,19 +31,14 @@ PushToHubMixin, RepositoryNotFoundError, RevisionNotFoundError, - TensorType, cached_path, copy_func, hf_bucket_url, - is_flax_available, - is_remote_url, - is_torch_available, - is_tf_available, is_offline_mode, + is_remote_url, logging, - torch_required, ) -from .utils.generic import _is_jax, _is_numpy, _is_torch_device + logger = logging.get_logger(__name__) From 9bc91578fd60c5a2662440946807162159c327e8 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 13:24:56 +0100 Subject: [PATCH 022/162] Fixup and docs --- src/transformers/image_processing_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 38778039aee2a3..872ce352c3de1c 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -40,7 +40,6 @@ ) - logger = logging.get_logger(__name__) From 6ec382acb30842b9a1871e907c923b87293ca63e Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Wed, 27 Jul 2022 15:55:48 +0100 Subject: [PATCH 023/162] Mixin for saving the image processor --- src/transformers/image_processing_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 872ce352c3de1c..6e7c8e530d72b7 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -361,16 +361,16 @@ def to_dict(self) -> Dict[str, Any]: @classmethod def from_json_file(cls, json_file: Union[str, os.PathLike]): """ - Instantiates an image processor of type [`~image_processing_utils.ImageProcessorMixin`] from the path to a JSON - file of parameters. + Instantiates an image processor of type [`~image_processing_utils.ImageProcessorMixin`] from the path to + a JSON file of parameters. 
        Args:
             json_file (`str` or `os.PathLike`):
                 Path to the JSON file containing the parameters.
 
         Returns:
-            An image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The image_processor object
-            instantiated from that JSON file.
+            An image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The image_processor
+            object instantiated from that JSON file.
         """
         with open(json_file, "r", encoding="utf-8") as reader:
             text = reader.read()

From 56ee6ad282f0d17f3e100655ecef58b0ae0cb354 Mon Sep 17 00:00:00 2001
From: Amy Roberts
Date: Thu, 28 Jul 2022 13:24:56 +0100
Subject: [PATCH 024/162] Fixup and docs

---
 src/transformers/image_processing_utils.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py
index 6e7c8e530d72b7..872ce352c3de1c 100644
--- a/src/transformers/image_processing_utils.py
+++ b/src/transformers/image_processing_utils.py
@@ -361,16 +361,16 @@ def to_dict(self) -> Dict[str, Any]:
     @classmethod
     def from_json_file(cls, json_file: Union[str, os.PathLike]):
         """
-        Instantiates an image processor of type [`~image_processing_utils.ImageProcessorMixin`] from the path to
-        a JSON file of parameters.
+        Instantiates an image processor of type [`~image_processing_utils.ImageProcessorMixin`] from the path to a JSON
+        file of parameters.
 
         Args:
             json_file (`str` or `os.PathLike`):
                 Path to the JSON file containing the parameters.
 
         Returns:
-            An image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The image_processor
-            object instantiated from that JSON file.
+            An image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The image_processor object
+            instantiated from that JSON file.
""" with open(json_file, "r", encoding="utf-8") as reader: text = reader.read() From 6b88d5f8e09d3ea7d22317eb241df67b7286697d Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 15:57:24 +0100 Subject: [PATCH 025/162] Add rescale back and remove ImageType --- .../en/internal/image_processing_utils.mdx | 5 ++- src/transformers/__init__.py | 4 +-- src/transformers/image_transforms.py | 18 ++++++++++- src/transformers/image_utils.py | 32 ++----------------- .../models/glpn/image_processing_glpn.py | 23 +++++++++---- .../utils/dummy_vision_objects.py | 4 +++ 6 files changed, 46 insertions(+), 40 deletions(-) diff --git a/docs/source/en/internal/image_processing_utils.mdx b/docs/source/en/internal/image_processing_utils.mdx index 4d5831a12fd623..8bdf0ed11099b2 100644 --- a/docs/source/en/internal/image_processing_utils.mdx +++ b/docs/source/en/internal/image_processing_utils.mdx @@ -19,10 +19,13 @@ Most of those are only useful if you are studying the code of the image processo ## Image Transformations -[[autodoc]] image_transforms.to_pil_image +[[autodoc]] image_transforms.rescale [[autodoc]] image_transforms.resize +[[autodoc]] image_transforms.to_pil_image + + ## ImageProcessorMixin diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index a5c3e0d9057515..9b34e4cea7f7eb 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -633,7 +633,7 @@ ] else: _import_structure["image_processing_utils"] = ["ImageProcessorMixin"] - _import_structure["image_transforms"] = ["resize", "to_pil_image"] + _import_structure["image_transforms"] = ["rescale", "resize", "to_pil_image"] _import_structure["image_utils"] = ["ImageFeatureExtractionMixin"] _import_structure["models.beit"].append("BeitFeatureExtractor") _import_structure["models.clip"].append("CLIPFeatureExtractor") @@ -3342,7 +3342,7 @@ from .utils.dummy_vision_objects import * else: from .image_processing_utils import ImageProcessorMixin - from .image_transforms import resize, to_pil_image + from .image_transforms import rescale, resize, to_pil_image from .image_utils import ImageFeatureExtractionMixin from .models.beit import BeitFeatureExtractor from .models.clip import CLIPFeatureExtractor, CLIPProcessor diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index c7e9c0ec9e20f1..b15a1372a9532c 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -39,10 +39,26 @@ import jax.numpy as jnp +def rescale(image: np.ndarray, scale: Union[float, int] = 255) -> np.ndarray: + """ + Rescales `image` by `scale`. + + Args: + image (`np.ndarray``): + The image to rescale. + scale (`float`, `int`): + The scale to use for rescaling the image. + + Returns: + image: A rescaled np.ndarray image. + """ + return image * scale + + def to_pil_image( image: Union[np.ndarray, PIL.Image.Image, "torch.Tensor", "tf.Tensor", "jnp.ndarray"], channel_dim: Optional[ChannelDimension] = None, - rescale=None, + do_rescale=None, ) -> PIL.Image.Image: """ Converts `image` to a PIL Image. 
Optionally rescales it and puts the channel dimension back as the last axis if diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index 8fdf7aadac4dec..3bb72816ced225 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -24,7 +24,7 @@ import requests from .utils import is_flax_available, is_tf_available, is_torch_available -from .utils.generic import _is_jax, _is_tensorflow, _is_torch +from .utils.generic import _is_jax, _is_tensorflow, _is_torch, to_numpy IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406] @@ -36,22 +36,12 @@ PIL.Image.Image, np.ndarray, "torch.Tensor", List[PIL.Image.Image], List[np.ndarray], List["torch.Tensor"] # noqa ] + class ChannelDimension(enum.Enum): FIRST = 1 LAST = 3 -class ImageType(ExplicitEnum): - """ - Possible image data formats that can be fed into an image processor - """ - PYTORCH = "pt" - TENSORFLOW = "tf" - NUMPY = "np" - JAX = "jax" - PIL = "pillow" - - def is_torch_tensor(obj): return _is_torch(obj) if is_torch_available() else False @@ -83,24 +73,8 @@ def is_batched(img): return False -def get_image_type(obj) -> TensorType: - if is_torch_tensor(obj): - return TensorType.TORCH - elif is_tf_tensor(obj): - return TensorType.TF - elif is_jax_tensor(obj): - return TensorType.JAX - elif _is_numpy(obj): - return TensorType.NUMPY - elif isinstance(obj, PIL.Image.Image): - return TensorType.PIL - else: - raise ValueError("Could not infer tensor type") - - def to_numpy_array(img) -> np.ndarray: - input_type = get_image_type(img) - if input_type == ImageType.PIL: + if isinstance(img, PIL.Image.Image): return np.array(img) return to_numpy(img) diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py index bdc33e80ce9f87..c07163a7e1d9d6 100644 --- a/src/transformers/models/glpn/image_processing_glpn.py +++ b/src/transformers/models/glpn/image_processing_glpn.py @@ -19,23 +19,30 @@ import PIL.Image from numpy import np +from transformers.utils.generic import TensorType + from ...image_processing_utils import BaseImageProcessor, BatchFeature -from ...image_transforms import resize, rescale -from ...image_utils import ImageType, is_batched, to_numpy_array, valid_images, get_image_size +from ...image_transforms import rescale, resize +from ...image_utils import get_image_size, is_batched, to_numpy_array, valid_images from ...utils import logging + logger = logging.get_logger(__name__) class GLPNImageProcessor(BaseImageProcessor): - def __init__(self, do_resize=True, do_rescale=True, size_divisor=32, resample=PIL.Image.Resampling.BILINEAR, **kwargs) -> None: + def __init__( + self, do_resize=True, do_rescale=True, size_divisor=32, resample=PIL.Image.Resampling.BILINEAR, **kwargs + ) -> None: self.do_resize = do_resize self.do_rescale = do_rescale self.size_divisor = size_divisor self.resample = resample super().__init__(**kwargs) - def resize(self, image: np.ndarray, size_divisor: Union[int, float], resample: PIL.Image.Resampling, **kwargs) -> np.ndarray: + def resize( + self, image: np.ndarray, size_divisor: Union[int, float], resample: PIL.Image.Resampling, **kwargs + ) -> np.ndarray: height, width = get_image_size(image) new_h = height // size_divisor * size_divisor new_w = width // size_divisor * size_divisor @@ -45,14 +52,16 @@ def resize(self, image: np.ndarray, size_divisor: Union[int, float], resample: P def rescale(self, image: np.ndarray, scale: Union[int, float], **kwargs) -> np.ndarray: return rescale(image, scale, **kwargs) - def 
preprocess(self, images, do_resize=None, do_rescale=None, size_divisor=None, resample=None, return_tensors=None, **kwargs) -> BatchFeature: + def preprocess( + self, images, do_resize=None, do_rescale=None, size_divisor=None, resample=None, return_tensors=None, **kwargs + ) -> BatchFeature: do_resize = do_resize if do_resize is not None else self.do_resize do_rescale = do_rescale if do_rescale is not None else self.do_rescale size_divisor = size_divisor if size_divisor is not None else self.size_divisor resample = resample if resample is not None else self.resample # If a return type isn't specified, default to numpy arrays. - return_tensors = ImageType.NUMPY if return_tensors is None else return_tensors + return_tensors = TensorType.NUMPY if return_tensors is None else return_tensors if do_resize and size_divisor is None: raise ValueError("size_divisor is required for resizing") @@ -70,7 +79,7 @@ def preprocess(self, images, do_resize=None, do_rescale=None, size_divisor=None, images = [self.resize(image, size_divisor=size_divisor, resample=resample) for image in images] if do_rescale: - images = [self.rescale(image) for image in images] + images = [self.rescale(image, scale=255) for image in images] data = {"pixel_values": images} return BatchFeature(**data, return_tensors=return_tensors) diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 5756f67326b7ad..6622564eafd6c5 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -10,6 +10,10 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +def rescale(*args, **kwargs): + requires_backends(rescale, ["vision"]) + + def resize(*args, **kwargs): requires_backends(resize, ["vision"]) From 67077f1de95425da52541078f0ce0eb121158889 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 16:19:42 +0100 Subject: [PATCH 026/162] fix import mistake --- src/transformers/models/glpn/image_processing_glpn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py index c07163a7e1d9d6..8ba6568bfcf024 100644 --- a/src/transformers/models/glpn/image_processing_glpn.py +++ b/src/transformers/models/glpn/image_processing_glpn.py @@ -17,7 +17,7 @@ from typing import Union import PIL.Image -from numpy import np +import numpy as np from transformers.utils.generic import TensorType From 82712c76526385991340d552dca8afd84d563ab3 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 16:21:13 +0100 Subject: [PATCH 027/162] Fix enum var reference --- src/transformers/image_transforms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 9112506e8ebffa..3eb6ee813dd62f 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -66,7 +66,7 @@ def to_pil_image( # If the channel as been moved to first dim, we put it back at the end. channel_dim = infer_channel_dimension(image) if channel_dim is None else channel_dim - if channel_dim == ChannelDimension.CHANNEL_FIRST: + if channel_dim == ChannelDimension.FIRST: image = image.transpose((1, 2, 0)) # PIL.Image can only store uint8 values, so we rescale the image to be between 0 and 255 if needed. 
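At this point in the series the functional helpers are usable on their own. A minimal sketch of how `rescale` and `to_pil_image` compose as of PATCH 027, assuming the module layout shown in the diffs above; the input array is invented for illustration:

```python
# Hedged sketch, not part of the patch series: exercising the functional API
# as it stands after PATCH 027. The sample array is a made-up example input.
import numpy as np

from transformers.image_transforms import rescale, to_pil_image

image = np.random.randint(0, 256, (3, 64, 64), dtype=np.uint8)  # channels-first uint8 image

# to_pil_image infers the channel dimension, moves it to the last axis and
# returns a PIL.Image.Image.
pil_image = to_pil_image(image)

# rescale simply multiplies by `scale`; 1 / 255 maps uint8 pixel values into [0, 1].
scaled = rescale(image.astype(np.float32), scale=1 / 255)
```

The `resize` helper is deliberately left out of this sketch: its PIL-to-numpy conversion is still being reworked in the commits that follow.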
From b997a9848e93a672e94f6b0d3ce6763857c1e1f5 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 18:15:44 +0100 Subject: [PATCH 028/162] Can transform and specify image data format --- src/transformers/image_transforms.py | 86 ++++++++++++++++++++++++---- src/transformers/image_utils.py | 9 ++- 2 files changed, 80 insertions(+), 15 deletions(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 3eb6ee813dd62f..daa8264d89b441 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -37,10 +37,60 @@ import tensorflow as tf +def to_channel_dimension_format(image: np.ndarray, channel_dim: Union[ChannelDimension, str]) -> np.ndarray: + """ + Converts `image` to the channel dimension format specified by `channel_dim`. + + Args: + image (`numpy.ndarray`): + The image to convert to the PIL Image format. + channel_dim (`ChannelDimension`): + The channel dimension format to use. + + Returns: + image: A converted np.ndarray. + """ + current_channel_dim = infer_channel_dimension(image) + target_channel_dim = ChannelDimension(channel_dim) + if current_channel_dim == target_channel_dim: + return image + + if target_channel_dim == ChannelDimension.FIRST: + return image.transpose((2, 0, 1)) + + if target_channel_dim == ChannelDimension.LAST: + return image.transpose((1, 2, 0)) + + raise ValueError("Unsupported channel dimension format: {}".format(channel_dim)) + + +def get_channel_dimension_format(image: np.ndarray) -> ChannelDimension: + """ + Infers the channel dimension format of `image`. + + Args: + image (`numpy.ndarray`): + The image to get the channel dimension format of. + + Returns: + channel_dim: The channel dimension format of `image`. + """ + if image.ndim == 3: + first_dim, last_dim = 0, 2 + elif image.ndim == 4: + first_dim, last_dim = 1, 3 + else: + raise ValueError("Unsupported number of image dimensions: {}".format(image.ndim)) + + if image[first_dim] in (1, 3): + return ChannelDimension.FIRST + elif image[last_dim] in (1, 3): + return ChannelDimension.LAST + raise Exception("Unable to infer channel dimension format") + + def to_pil_image( - image: Union[np.ndarray, PIL.Image.Image, "torch.Tensor", "tf.Tensor"], - channel_dim: Optional[ChannelDimension] = None, - rescale=None, + image: Union[np.ndarray, PIL.Image.Image, "torch.Tensor", "tf.Tensor"], rescale=None ) -> PIL.Image.Image: """ Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if @@ -65,9 +115,7 @@ def to_pil_image( raise ValueError("Input image must be of type PIL.Image.Image, numpy.ndarray or torch.Tensor") # If the channel as been moved to first dim, we put it back at the end. - channel_dim = infer_channel_dimension(image) if channel_dim is None else channel_dim - if channel_dim == ChannelDimension.FIRST: - image = image.transpose((1, 2, 0)) + image = to_channel_dimension_format(image, ChannelDimension.LAST) # PIL.Image can only store uint8 values, so we rescale the image to be between 0 and 255 if needed. rescale = isinstance(image.flat[0], float) if rescale is None else rescale @@ -143,9 +191,14 @@ def get_resize_output_image_size( return (new_short, new_long) if width <= height else (new_long, new_short) -def resize(image, size: Tuple[int, int], resample=PIL.Image.BILINEAR): +def resize( + image, + size: Tuple[int, int], + resample=PIL.Image.Resampling.BILINEAR, + data_format: Optional[ChannelDimension] = None, +) -> np.np.ndarray: """ - Resizes `image`. 
Enforces conversion of input to PIL.Image. + Resizes `image` to (h, w) specified by `size` using the PIL library. Args: image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`): @@ -154,13 +207,26 @@ def resize(image, size: Tuple[int, int], resample=PIL.Image.BILINEAR): The size to use for resizing the image. resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`): The filter to user for resampling. + data_format (`ChannelDimension`, *optional*, defaults to `None`): + The channel dimension format of the output image. If `None`, will use the inferred format from the input. Returns: image: A resized np.ndarray. """ + if not len(size) == 2: + raise ValueError("size must have 2 elements") + + # For all transformations, we want to keep the same data format as the input image unless otherwise specified. + # The resized image from PIL will always have channels last, so find the input format first. + data_format = get_channel_dimension_format(image) if data_format is None else data_format + # To maintain backwards compatibility with the resizing done in previous image feature extractors, we use # the pillow library to resize the image and then convert back to numpy if not isinstance(image, PIL.Image.Image): image = to_pil_image(image) - resized_image = image.resize(size, resample=resample) - return resized_image.numpy() + # PIL images are in the format (width, height) + h, w = size + resized_image = image.resize((w, h), resample=resample) + resized_image = np.array(resized_image) + resized_image = to_channel_dimension_format(resized_image, data_format) + return resized_image diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index 15bcf99542614c..a3a12fc7500f9a 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import enum import os from typing import List, Tuple, Union @@ -24,7 +23,7 @@ import requests from .utils import is_flax_available, is_tf_available, is_torch_available -from .utils.generic import _is_jax, _is_tensorflow, _is_torch +from .utils.generic import ExplicitEnum, _is_jax, _is_tensorflow, _is_torch IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406] @@ -49,9 +48,9 @@ def is_jax_tensor(obj): return _is_jax(obj) if is_flax_available() else False -class ChannelDimension(enum.Enum): - FIRST = 1 - LAST = 3 +class ChannelDimension(ExplicitEnum): + FIRST = "channels_first" + LAST = "channels_last" def infer_channel_dimension(image: np.ndarray) -> ChannelDimension: From 1b3cf656fcbb4001ecd82f2cbecf8b32d6a0ce71 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 18:19:24 +0100 Subject: [PATCH 029/162] Remove redundant function --- src/transformers/image_transforms.py | 31 +++------------------------- src/transformers/image_utils.py | 18 +++++++--------- 2 files changed, 11 insertions(+), 38 deletions(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index daa8264d89b441..0198d029f8cfd6 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -23,7 +23,7 @@ from .image_utils import ( ChannelDimension, get_image_size, - infer_channel_dimension, + infer_channel_dimension_format, is_jax_tensor, is_tf_tensor, is_torch_tensor, @@ -50,7 +50,7 @@ def to_channel_dimension_format(image: np.ndarray, channel_dim: Union[ChannelDim Returns: image: A converted np.ndarray. 
""" - current_channel_dim = infer_channel_dimension(image) + current_channel_dim = infer_channel_dimension_format(image) target_channel_dim = ChannelDimension(channel_dim) if current_channel_dim == target_channel_dim: return image @@ -64,31 +64,6 @@ def to_channel_dimension_format(image: np.ndarray, channel_dim: Union[ChannelDim raise ValueError("Unsupported channel dimension format: {}".format(channel_dim)) -def get_channel_dimension_format(image: np.ndarray) -> ChannelDimension: - """ - Infers the channel dimension format of `image`. - - Args: - image (`numpy.ndarray`): - The image to get the channel dimension format of. - - Returns: - channel_dim: The channel dimension format of `image`. - """ - if image.ndim == 3: - first_dim, last_dim = 0, 2 - elif image.ndim == 4: - first_dim, last_dim = 1, 3 - else: - raise ValueError("Unsupported number of image dimensions: {}".format(image.ndim)) - - if image[first_dim] in (1, 3): - return ChannelDimension.FIRST - elif image[last_dim] in (1, 3): - return ChannelDimension.LAST - raise Exception("Unable to infer channel dimension format") - - def to_pil_image( image: Union[np.ndarray, PIL.Image.Image, "torch.Tensor", "tf.Tensor"], rescale=None ) -> PIL.Image.Image: @@ -218,7 +193,7 @@ def resize( # For all transformations, we want to keep the same data format as the input image unless otherwise specified. # The resized image from PIL will always have channels last, so find the input format first. - data_format = get_channel_dimension_format(image) if data_format is None else data_format + data_format = infer_channel_dimension_format(image) if data_format is None else data_format # To maintain backwards compatibility with the resizing done in previous image feature extractors, we use # the pillow library to resize the image and then convert back to numpy diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index a3a12fc7500f9a..9eceb3a032a98a 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -53,9 +53,9 @@ class ChannelDimension(ExplicitEnum): LAST = "channels_last" -def infer_channel_dimension(image: np.ndarray) -> ChannelDimension: +def infer_channel_dimension_format(image: np.ndarray) -> ChannelDimension: """ - Infers the channel dimension of the image. + Infers the channel dimension format of `image`. Args: image (`np.ndarray`): @@ -65,19 +65,17 @@ def infer_channel_dimension(image: np.ndarray) -> ChannelDimension: The channel dimension of the image. 
""" if image.ndim == 3: - first_dim = 0 - last_dim = 2 + first_dim, last_dim = 0, 2 elif image.ndim == 4: - first_dim = 1 - last_dim = 3 + first_dim, last_dim = 1, 3 else: - raise ValueError(f"Unsupported image dimension: {image.ndim}") + raise ValueError("Unsupported number of image dimensions: {}".format(image.ndim)) - if image.shape[first_dim] in (1, 3): + if image[first_dim] in (1, 3): return ChannelDimension.FIRST - elif image.shape[last_dim] in (1, 3): + elif image[last_dim] in (1, 3): return ChannelDimension.LAST - raise Exception("Could not infer channel dimension") + raise Exception("Unable to infer channel dimension format") def get_image_size(image: np.ndarray, channel_dim: ChannelDimension = None) -> Tuple[int, int]: From 286046004cb0fdc8edc537ed91d7a0ba6d250754 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 18:20:35 +0100 Subject: [PATCH 030/162] Update reference --- src/transformers/image_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index 9eceb3a032a98a..018a8c6267c5a9 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -92,7 +92,7 @@ def get_image_size(image: np.ndarray, channel_dim: ChannelDimension = None) -> T A tuple of the image's height and width. """ if channel_dim is None: - channel_dim = infer_channel_dimension(image) + channel_dim = infer_channel_dimension_format(image) if channel_dim == ChannelDimension.FIRST: return image.shape[-2], image.shape[-1] From 60c56e5dc4d428b83ca1f23b4162596232ade716 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Fri, 29 Jul 2022 15:53:37 +0100 Subject: [PATCH 031/162] Data format flag for rescale --- src/transformers/image_transforms.py | 37 ++++++++++++++++------------ 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 16b297f3abfc21..950a75e08141d1 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -39,22 +39,6 @@ import jax.numpy as jnp -def rescale(image: np.ndarray, scale: Union[float, int] = 255) -> np.ndarray: - """ - Rescales `image` by `scale`. - - Args: - image (`np.ndarray``): - The image to rescale. - scale (`float`, `int`): - The scale to use for rescaling the image. - - Returns: - image: A rescaled np.ndarray image. - """ - return image * scale - - def to_channel_dimension_format(image: np.ndarray, channel_dim: Union[ChannelDimension, str]) -> np.ndarray: """ Converts `image` to the channel dimension format specified by `channel_dim`. @@ -82,6 +66,27 @@ def to_channel_dimension_format(image: np.ndarray, channel_dim: Union[ChannelDim raise ValueError("Unsupported channel dimension format: {}".format(channel_dim)) +def rescale( + image: np.ndarray, scale: Union[float, int] = 255, data_format: Optional[ChannelDimension] = None +) -> np.ndarray: + """ + Rescales `image` by `scale`. + + Args: + image (`np.ndarray``): + The image to rescale. + scale (`float`, `int`): + The scale to use for rescaling the image. + + Returns: + image: A rescaled np.ndarray image. 
+ """ + rescaled_image = image * scale + if data_format is not None: + rescaled_image = to_channel_dimension_format(rescaled_image, data_format) + return rescaled_image + + def to_pil_image( image: Union[np.ndarray, PIL.Image.Image, "torch.Tensor", "tf.Tensor", "jnp.Tensor"], do_rescale=None ) -> PIL.Image.Image: From 9294dbcef3e4cebb4ac68efd614c47e8ee8a6638 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Fri, 29 Jul 2022 15:59:31 +0100 Subject: [PATCH 032/162] Fix typo --- src/transformers/image_transforms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 950a75e08141d1..1863c01d60ecee 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -194,7 +194,7 @@ def resize( size: Tuple[int, int], resample=PIL.Image.Resampling.BILINEAR, data_format: Optional[ChannelDimension] = None, -) -> np.np.ndarray: +) -> np.ndarray: """ Resizes `image` to (h, w) specified by `size` using the PIL library. From 654cf93230788364c56badcd629fcec67f771c9d Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Fri, 29 Jul 2022 16:39:16 +0100 Subject: [PATCH 033/162] Fix dimension check --- src/transformers/image_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index 018a8c6267c5a9..8214d9e6bc5734 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -71,9 +71,9 @@ def infer_channel_dimension_format(image: np.ndarray) -> ChannelDimension: else: raise ValueError("Unsupported number of image dimensions: {}".format(image.ndim)) - if image[first_dim] in (1, 3): + if image.shape[first_dim] in (1, 3): return ChannelDimension.FIRST - elif image[last_dim] in (1, 3): + elif image.shape[last_dim] in (1, 3): return ChannelDimension.LAST raise Exception("Unable to infer channel dimension format") From 88b82e936793a2746c56bdc0c47c3c4b4c8b1590 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Fri, 29 Jul 2022 16:47:43 +0100 Subject: [PATCH 034/162] Fixes to make IP and FE outputs match --- src/transformers/image_transforms.py | 8 +++- .../models/glpn/image_processing_glpn.py | 37 ++++++++++++++----- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 1863c01d60ecee..9f7e1f48520f6e 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -67,7 +67,7 @@ def to_channel_dimension_format(image: np.ndarray, channel_dim: Union[ChannelDim def rescale( - image: np.ndarray, scale: Union[float, int] = 255, data_format: Optional[ChannelDimension] = None + image: np.ndarray, scale: Union[float, int] = 255, data_format: Optional[ChannelDimension] = None, dtype=np.float32 ) -> np.ndarray: """ Rescales `image` by `scale`. @@ -77,6 +77,11 @@ def rescale( The image to rescale. scale (`float`, `int`): The scale to use for rescaling the image. + data_format (`ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + dtype (`np.dtype`, *optional*): + The dtype of the output image. Defaults to `np.float32`.Used for backwards compatibility + with feature extractors Returns: image: A rescaled np.ndarray image. 
@@ -84,6 +89,7 @@ def rescale( rescaled_image = image * scale if data_format is not None: rescaled_image = to_channel_dimension_format(rescaled_image, data_format) + rescaled_image = rescaled_image.astype(dtype) return rescaled_image diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py index cce1dccf6aff7c..6ab3d85221eb84 100644 --- a/src/transformers/models/glpn/image_processing_glpn.py +++ b/src/transformers/models/glpn/image_processing_glpn.py @@ -14,7 +14,7 @@ # limitations under the License. """Image processor class for GLPN.""" -from typing import Union +from typing import Optional, Union import numpy as np import PIL.Image @@ -22,8 +22,8 @@ from transformers.utils.generic import TensorType from ...image_processing_utils import BaseImageProcessor, BatchFeature -from ...image_transforms import rescale, resize -from ...image_utils import get_image_size, is_batched, to_numpy_array, valid_images +from ...image_transforms import rescale, resize, to_channel_dimension_format +from ...image_utils import ChannelDimension, get_image_size, is_batched, to_numpy_array, valid_images from ...utils import logging @@ -41,19 +41,34 @@ def __init__( super().__init__(**kwargs) def resize( - self, image: np.ndarray, size_divisor: Union[int, float], resample: PIL.Image.Resampling, **kwargs + self, + image: np.ndarray, + size_divisor: Union[int, float], + resample: PIL.Image.Resampling, + data_format: Optional[ChannelDimension] = None, + **kwargs ) -> np.ndarray: height, width = get_image_size(image) new_h = height // size_divisor * size_divisor new_w = width // size_divisor * size_divisor - image = resize(image, (new_h, new_w), resample=resample, **kwargs) + image = resize(image, (new_h, new_w), resample=resample, data_format=data_format, **kwargs) return image - def rescale(self, image: np.ndarray, scale: Union[int, float], **kwargs) -> np.ndarray: - return rescale(image, scale, **kwargs) + def rescale( + self, image: np.ndarray, scale: Union[int, float], data_format: Optional[ChannelDimension] = None, **kwargs + ) -> np.ndarray: + return rescale(image=image, scale=scale, data_format=data_format, **kwargs) def preprocess( - self, images, do_resize=None, do_rescale=None, size_divisor=None, resample=None, return_tensors=None, **kwargs + self, + images, + do_resize: bool = None, + do_rescale: bool = None, + size_divisor: int = None, + resample: PIL.Image.Resampling = None, + return_tensors: Optional[Union[TensorType, str]] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + **kwargs ) -> BatchFeature: do_resize = do_resize if do_resize is not None else self.do_resize do_rescale = do_rescale if do_rescale is not None else self.do_rescale @@ -79,7 +94,9 @@ def preprocess( images = [self.resize(image, size_divisor=size_divisor, resample=resample) for image in images] if do_rescale: - images = [self.rescale(image, scale=255) for image in images] + images = [self.rescale(image, scale=1/255) for image in images] + + images = [to_channel_dimension_format(image, data_format) for image in images] data = {"pixel_values": images} - return BatchFeature(**data, return_tensors=return_tensors) + return BatchFeature(data=data, tensor_type=return_tensors) From 3ea27aa989bf8091bcd0032d25ccef5dde87db76 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Fri, 29 Jul 2022 18:23:14 +0100 Subject: [PATCH 035/162] Add tests for transforms --- src/transformers/image_transforms.py | 8 +- tests/test_image_transforms.py | 128 +++++++++++++++++++++++++++ 
utils/tests_fetcher.py | 1 + 3 files changed, 133 insertions(+), 4 deletions(-) create mode 100644 tests/test_image_transforms.py diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 0198d029f8cfd6..aed291b56365fc 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -105,7 +105,7 @@ def get_resize_output_image_size( size: Union[int, Tuple[int, int], List[int], Tuple[int]], default_to_square: bool = True, max_size: int = None, -) -> np.ndarray: +) -> tuple: """ Find the target (height, width) dimension of the output image after resizing given the input image and the desired size. @@ -135,7 +135,7 @@ def get_resize_output_image_size( """ if isinstance(size, (tuple, list)): if len(size) == 2: - return size + return tuple(size) elif len(size) == 1: # Perform same logic as if size was an int size = size[0] @@ -163,7 +163,7 @@ def get_resize_output_image_size( if new_long > max_size: new_short, new_long = int(max_size * new_short / new_long), max_size - return (new_short, new_long) if width <= height else (new_long, new_short) + return (new_long, new_short) if width <= height else (new_short, new_long) def resize( @@ -171,7 +171,7 @@ def resize( size: Tuple[int, int], resample=PIL.Image.Resampling.BILINEAR, data_format: Optional[ChannelDimension] = None, -) -> np.np.ndarray: +) -> np.ndarray: """ Resizes `image` to (h, w) specified by `size` using the PIL library. diff --git a/tests/test_image_transforms.py b/tests/test_image_transforms.py new file mode 100644 index 00000000000000..16bdcae09f6ffd --- /dev/null +++ b/tests/test_image_transforms.py @@ -0,0 +1,128 @@ +# coding=utf-8 +# Copyright 2022 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np + +from parameterized import parameterized +from transformers.testing_utils import require_vision +from transformers.utils.import_utils import is_flax_available, is_tf_available, is_torch_available, is_vision_available + + +if is_torch_available(): + import torch + +if is_tf_available(): + import tensorflow as tf + +if is_flax_available(): + import jax.numpy as jnp + +if is_vision_available(): + import PIL.Image + + from transformers.image_transforms import ( + get_resize_output_image_size, + resize, + to_channel_dimension_format, + to_pil_image, + ) + + +def get_random_image(height, width, num_channels=3, channels_first=True): + shape = (num_channels, height, width) if channels_first else (height, width, num_channels) + random_array = np.random.randint(0, 256, shape, dtype=np.uint8) + return random_array + + +@require_vision +class ImageTransformsTester(unittest.TestCase): + @parameterized.expand( + [ + ("numpy_float_channels_first", True, lambda x: x, (3, 4, 5), np.float32), + ("numpy_float_channels_last", True, lambda x: x, (4, 5, 3), np.float32), + ("numpy_int_channels_first", True, lambda x: x, (3, 4, 5), np.int32), + ("numpy_uint_channels_first", True, lambda x: x, (3, 4, 5), np.uint8), + ("tensorflow", is_tf_available, tf.convert_to_tensor, (3, 4, 5), np.float32), + ("torch", is_torch_available, torch.tensor, (3, 4, 5), np.float32), + ("jax", is_flax_available, jnp.array, (3, 4, 5), np.float32), + ] + ) + def test_to_pil_image(self, name, is_library_available, to_tensor, image_shape, dtype): + image = np.random.randint(0, 256, image_shape).astype(dtype) + if is_library_available: + image = to_tensor(image) + pil_image = to_pil_image(image) + self.assertIsInstance(pil_image, PIL.Image.Image) + + def test_to_channel_dimension_format(self): + # Test that function doesn't reorder if channel dim matches the input. + image = np.random.rand(3, 4, 5) + image = to_channel_dimension_format(image, "channels_first") + self.assertEqual(image.shape, (3, 4, 5)) + + image = np.random.rand(4, 5, 3) + image = to_channel_dimension_format(image, "channels_last") + self.assertEqual(image.shape, (4, 5, 3)) + + # Test that function reorders if channel dim doesn't match the input. + image = np.random.rand(3, 4, 5) + image = to_channel_dimension_format(image, "channels_last") + self.assertEqual(image.shape, (4, 5, 3)) + + image = np.random.rand(4, 5, 3) + image = to_channel_dimension_format(image, "channels_first") + self.assertEqual(image.shape, (3, 4, 5)) + + def test_get_resize_output_image_size(self): + image = np.random.randint(0, 256, (3, 224, 224)) + + # Test the output size defaults to (x, x) if an int is given. + self.assertEqual(get_resize_output_image_size(image, 10), (10, 10)) + self.assertEqual(get_resize_output_image_size(image, [10]), (10, 10)) + self.assertEqual(get_resize_output_image_size(image, (10,)), (10, 10)) + + # Test the output size is the same as the input if a two element tuple/list is given. 
+ self.assertEqual(get_resize_output_image_size(image, (10, 20)), (10, 20)) + self.assertEqual(get_resize_output_image_size(image, [10, 20]), (10, 20)) + self.assertEqual(get_resize_output_image_size(image, (10, 20), default_to_square=True), (10, 20)) + # To match pytorch behaviour, max_size is only relevant if size is an int + self.assertEqual(get_resize_output_image_size(image, (10, 20), max_size=5), (10, 20)) + + # Test output size = (int(size * height / width), size) if size is an int and height > width + image = np.random.randint(0, 256, (3, 50, 40)) + self.assertEqual(get_resize_output_image_size(image, 20, default_to_square=False), (25, 20)) + + # Test output size = (size, int(size * width / height)) if size is an int and width <= height + image = np.random.randint(0, 256, (3, 40, 50)) + self.assertEqual(get_resize_output_image_size(image, 20, default_to_square=False), (20, 25)) + + # Test size is resized if longer size > max_size + image = np.random.randint(0, 256, (3, 50, 40)) + self.assertEqual(get_resize_output_image_size(image, 20, default_to_square=False, max_size=22), (22, 17)) + + def test_resize(self): + image = np.random.randint(0, 256, (3, 224, 224)) + + # Check the channel order is the same by default + resized_image = resize(image, (30, 40)) + self.assertIsInstance(resized_image, np.ndarray) + self.assertEqual(resized_image.shape, (3, 30, 40)) + + # Check channel order is changed if specified + resized_image = resize(image, (30, 40), data_format="channels_last") + self.assertIsInstance(resized_image, np.ndarray) + self.assertEqual(resized_image.shape, (30, 40, 3)) diff --git a/utils/tests_fetcher.py b/utils/tests_fetcher.py index 9f18bb83c7ee7f..1fe515aec99521 100644 --- a/utils/tests_fetcher.py +++ b/utils/tests_fetcher.py @@ -353,6 +353,7 @@ def create_reverse_dependency_map(): "feature_extraction_sequence_utils.py": "test_sequence_feature_extraction_common.py", "feature_extraction_utils.py": "test_feature_extraction_common.py", "file_utils.py": ["utils/test_file_utils.py", "utils/test_model_output.py"], + "image_transforms.py": "test_image_transforms.py", "utils/generic.py": ["utils/test_file_utils.py", "utils/test_model_output.py", "utils/test_generic.py"], "utils/hub.py": "utils/test_file_utils.py", "modelcard.py": "utils/test_model_card.py", From 84fdd074ef99d6a90437bdebfc337e731be1505a Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Fri, 29 Jul 2022 18:51:03 +0100 Subject: [PATCH 036/162] Add test for utils --- src/transformers/image_utils.py | 2 +- tests/utils/test_image_utils.py | 52 ++++++++++++++++++++++++++++++++- 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index 8214d9e6bc5734..bebc3440832059 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -75,7 +75,7 @@ def infer_channel_dimension_format(image: np.ndarray) -> ChannelDimension: return ChannelDimension.FIRST elif image.shape[last_dim] in (1, 3): return ChannelDimension.LAST - raise Exception("Unable to infer channel dimension format") + raise ValueError("Unable to infer channel dimension format") def get_image_size(image: np.ndarray, channel_dim: ChannelDimension = None) -> Tuple[int, int]: diff --git a/tests/utils/test_image_utils.py b/tests/utils/test_image_utils.py index 6c870e3341cdf9..0d9999d33aef6d 100644 --- a/tests/utils/test_image_utils.py +++ b/tests/utils/test_image_utils.py @@ -17,8 +17,10 @@ import datasets import numpy as np +import pytest from transformers import 
is_torch_available, is_vision_available +from transformers.image_utils import ChannelDimension from transformers.testing_utils import require_torch, require_vision @@ -29,7 +31,7 @@ import PIL.Image from transformers import ImageFeatureExtractionMixin - from transformers.image_utils import load_image + from transformers.image_utils import get_image_size, infer_channel_dimension_format, load_image def get_random_image(height, width): @@ -485,3 +487,51 @@ def test_load_img_exif_transpose(self): img_arr_with_exif_transpose.shape, (500, 333, 3), ) + + +class UtilFunctionTester(unittest.TestCase): + def test_get_image_size(self): + # Test we can infer the size and channel dimension of an image. + image = np.random.randint(0, 256, (32, 64, 3)) + self.assertEqual(get_image_size(image), (32, 64)) + + image = np.random.randint(0, 256, (3, 32, 64)) + self.assertEqual(get_image_size(image), (32, 64)) + + # Test the channel dimension can be overriden + image = np.random.randint(0, 256, (3, 32, 64)) + self.assertEqual(get_image_size(image, channel_dim=ChannelDimension.LAST), (3, 32)) + + def test_infer_channel_dimension(self): + # Test we fail with invalid input + with pytest.raises(ValueError): + infer_channel_dimension_format(np.random.randint(0, 256, (10, 10))) + + with pytest.raises(ValueError): + infer_channel_dimension_format(np.random.randint(0, 256, (10, 10, 10, 10, 10))) + + # Test we fail if neither first not last dimension is of size 3 or 1 + with pytest.raises(ValueError): + infer_channel_dimension_format(np.random.randint(0, 256, (10, 1, 50))) + + # Test we correctly identify the channel dimension + image = np.random.randint(0, 256, (3, 4, 5)) + inferred_dim = infer_channel_dimension_format(image) + self.assertEqual(inferred_dim, ChannelDimension.FIRST) + + image = np.random.randint(0, 256, (1, 4, 5)) + inferred_dim = infer_channel_dimension_format(image) + self.assertEqual(inferred_dim, ChannelDimension.FIRST) + + image = np.random.randint(0, 256, (4, 5, 3)) + inferred_dim = infer_channel_dimension_format(image) + self.assertEqual(inferred_dim, ChannelDimension.LAST) + + image = np.random.randint(0, 256, (4, 5, 1)) + inferred_dim = infer_channel_dimension_format(image) + self.assertEqual(inferred_dim, ChannelDimension.LAST) + + # We can take a batched array of images and find the dimension + image = np.random.randint(0, 256, (1, 3, 4, 5)) + inferred_dim = infer_channel_dimension_format(image) + self.assertEqual(inferred_dim, ChannelDimension.FIRST) From 392e980d3f8a961fc89cd0bef4ec4470f3b1da52 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Tue, 2 Aug 2022 12:24:04 +0100 Subject: [PATCH 037/162] Update some docstrings --- src/transformers/image_processing_utils.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 7e0a0f3a7f9aa8..7fcf7ee23d12df 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -42,6 +42,7 @@ logger = logging.get_logger(__name__) +# TODO: Abstract out common logic between ImageProcessorMixin and FeatureExtractorMixin class ImageProcessorMixin(PushToHubMixin): """ Image processor mixin used to provide saving/loading functionality @@ -79,7 +80,7 @@ class of [`BaseImageProcessor`]. huggingface.co. 
Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`. - a path to a *directory* containing a image processor file saved using the - [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] method, e.g., + [`~image_processing_utils.ImageProcessorMixin.save_pretrained`] method, e.g., `./my_model_directory/`. - a path or url to a saved image processor JSON *file*, e.g., `./my_model_directory/preprocessor_config.json`. @@ -131,7 +132,7 @@ class of [`BaseImageProcessor`]. def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs): """ Save an image_processor object to the directory `save_directory`, so that it can be re-loaded using the - [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] class method. + [`~image_processing_utils.ImageProcessorMixin.from_pretrained`] instance method. Args: save_directory (`str` or `os.PathLike`): @@ -181,7 +182,7 @@ def get_image_processor_dict( ) -> Tuple[Dict[str, Any], Dict[str, Any]]: """ From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a - image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`] using `from_dict`. + image processor of type [`~image_processing_utils.ImageProcessorMixin`] using `from_dict`. Parameters: pretrained_model_name_or_path (`str` or `os.PathLike`): @@ -293,19 +294,19 @@ def get_image_processor_dict( @classmethod def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): """ - Instantiates a type of [`~feature_extraction_utils.FeatureExtractionMixin`] from a Python dictionary of + Instantiates a type of [`~image_processing_utils.ImageProcessorMixin`] from a Python dictionary of parameters. Args: image_processor_dict (`Dict[str, Any]`): Dictionary that will be used to instantiate the image processor object. Such a dictionary can be retrieved from a pretrained checkpoint by leveraging the - [`~feature_extraction_utils.FeatureExtractionMixin.to_dict`] method. + [`~image_processing_utils.ImageProcessorMixin.to_dict`] method. kwargs (`Dict[str, Any]`): Additional parameters from which to initialize the image processor object. Returns: - [`~feature_extraction_utils.FeatureExtractionMixin`]: The image processor object instantiated from those + [`~image_processing_utils.ImageProcessorMixin`]: The image processor object instantiated from those parameters. """ return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) @@ -350,7 +351,7 @@ def from_json_file(cls, json_file: Union[str, os.PathLike]): Path to the JSON file containing the parameters. Returns: - A image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The image_processor object + A image processor of type [`~image_processing_utils.ImageProcessorMixin`]: The image_processor object instantiated from that JSON file. 
""" with open(json_file, "r", encoding="utf-8") as reader: From a28ac88f40a5f9a8f1d03f65156c915ce6e997ca Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Tue, 2 Aug 2022 19:15:56 +0100 Subject: [PATCH 038/162] Make sure in channels last before converting to PIL --- src/transformers/image_transforms.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index aed291b56365fc..6c0eecc519123d 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -198,9 +198,11 @@ def resize( # To maintain backwards compatibility with the resizing done in previous image feature extractors, we use # the pillow library to resize the image and then convert back to numpy if not isinstance(image, PIL.Image.Image): + # PIL expects image to have channels last + image = to_channel_dimension_format(image, ChannelDimension.LAST) image = to_pil_image(image) - # PIL images are in the format (width, height) h, w = size + # PIL images are in the format (width, height) resized_image = image.resize((w, h), resample=resample) resized_image = np.array(resized_image) resized_image = to_channel_dimension_format(resized_image, data_format) From 082e4ff9631b934c7eb25ba211ed4fd351171fbb Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Tue, 2 Aug 2022 19:18:53 +0100 Subject: [PATCH 039/162] Remove default to numpy batching --- src/transformers/models/glpn/image_processing_glpn.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py index 6ab3d85221eb84..eae2607334db83 100644 --- a/src/transformers/models/glpn/image_processing_glpn.py +++ b/src/transformers/models/glpn/image_processing_glpn.py @@ -75,9 +75,6 @@ def preprocess( size_divisor = size_divisor if size_divisor is not None else self.size_divisor resample = resample if resample is not None else self.resample - # If a return type isn't specified, default to numpy arrays. - return_tensors = TensorType.NUMPY if return_tensors is None else return_tensors - if do_resize and size_divisor is None: raise ValueError("size_divisor is required for resizing") From bf7335821e3e262d2a3b56749c5442d4235dc6a1 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Wed, 3 Aug 2022 12:53:39 +0100 Subject: [PATCH 040/162] Fix up --- src/transformers/image_processing_utils.py | 3 +-- src/transformers/image_transforms.py | 4 ++-- src/transformers/models/glpn/image_processing_glpn.py | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index a15b188349e2df..756a4cee7823ee 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -313,8 +313,7 @@ def get_image_processor_dict( @classmethod def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): """ - Instantiates a type of [`~image_processing_utils.ImageProcessorMixin`] from a Python dictionary of - parameters. + Instantiates a type of [`~image_processing_utils.ImageProcessorMixin`] from a Python dictionary of parameters. 
Args: image_processor_dict (`Dict[str, Any]`): diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 871665455f3c40..99149682616bdb 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -80,8 +80,8 @@ def rescale( data_format (`ChannelDimension`, *optional*): The channel dimension format of the image. If not provided, it will be the same as the input image. dtype (`np.dtype`, *optional*): - The dtype of the output image. Defaults to `np.float32`.Used for backwards compatibility - with feature extractors + The dtype of the output image. Defaults to `np.float32`.Used for backwards compatibility with feature + extractors Returns: image: A rescaled np.ndarray image. diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py index eae2607334db83..6c13650c2a5f48 100644 --- a/src/transformers/models/glpn/image_processing_glpn.py +++ b/src/transformers/models/glpn/image_processing_glpn.py @@ -91,7 +91,7 @@ def preprocess( images = [self.resize(image, size_divisor=size_divisor, resample=resample) for image in images] if do_rescale: - images = [self.rescale(image, scale=1/255) for image in images] + images = [self.rescale(image, scale=1 / 255) for image in images] images = [to_channel_dimension_format(image, data_format) for image in images] From 34b6b2fa64ef200192f03927997bb01bcb92122f Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Thu, 4 Aug 2022 09:22:42 +0100 Subject: [PATCH 041/162] Add docstring and model_input_types --- .../models/glpn/image_processing_glpn.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py index 6c13650c2a5f48..1cdb455b50526b 100644 --- a/src/transformers/models/glpn/image_processing_glpn.py +++ b/src/transformers/models/glpn/image_processing_glpn.py @@ -31,6 +31,25 @@ class GLPNImageProcessor(BaseImageProcessor): + r""" + Constructs a GLPN image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the input based on certain `size_divisor`. + size_divisor (`int` or `Tuple(int)`, *optional*, defaults to 32): + Make sure the input is divisible by this value. Only has an effect if `do_resize` is set to `True`. + resample (`int`, *optional*, defaults to `PIL.Image.Resampling.BILINEAR`): + An optional resampling filter. This can be one of `PIL.Image.Resampling.NEAREST`, + `PIL.Image.Resampling.BOX`, `PIL.Image.Resampling.BILINEAR`, `PIL.Image.Resampling.HAMMING`, + `PIL.Image.Resampling.BICUBIC` or `PIL.Image.Resampling.LANCZOS`. Only has an effect if `do_resize` is set + to `True`. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). 
+ """ + + model_input_names = ["pixel_values"] + def __init__( self, do_resize=True, do_rescale=True, size_divisor=32, resample=PIL.Image.Resampling.BILINEAR, **kwargs ) -> None: From 71502937c60296a1f26ca0444f2eff9a44be0be6 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Thu, 4 Aug 2022 09:38:14 +0100 Subject: [PATCH 042/162] Use feature processor config from hub --- src/transformers/utils/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index 530f456a90085a..6fc12d09b5114d 100644 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -156,7 +156,7 @@ FLAX_WEIGHTS_INDEX_NAME = "flax_model.msgpack.index.json" CONFIG_NAME = "config.json" FEATURE_EXTRACTOR_NAME = "preprocessor_config.json" -IMAGE_PROCESSOR_NAME = "image_processor_config.json" +IMAGE_PROCESSOR_NAME = FEATURE_EXTRACTOR_NAME MODEL_CARD_NAME = "modelcard.json" SENTENCEPIECE_UNDERLINE = "▁" From b1db434a41e75d0bc9ad2f9b6e618797ab43289a Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Thu, 4 Aug 2022 10:01:37 +0100 Subject: [PATCH 043/162] Alias GLPN feature extractor to image processor --- .../models/glpn/feature_extraction_glpn.py | 123 +----------------- 1 file changed, 4 insertions(+), 119 deletions(-) diff --git a/src/transformers/models/glpn/feature_extraction_glpn.py b/src/transformers/models/glpn/feature_extraction_glpn.py index 2694d56b898bec..b92385b0eb909e 100644 --- a/src/transformers/models/glpn/feature_extraction_glpn.py +++ b/src/transformers/models/glpn/feature_extraction_glpn.py @@ -14,126 +14,11 @@ # limitations under the License. """Feature extractor class for GLPN.""" -from typing import Optional, Union - -import numpy as np -from PIL import Image - -from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin -from ...image_utils import ImageFeatureExtractionMixin, ImageInput, is_torch_tensor -from ...utils import TensorType, logging +from .image_processing_glpn import GLPNImageProcessor +from ...utils import logging logger = logging.get_logger(__name__) - -class GLPNFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): - r""" - Constructs a GLPN feature extractor. - - This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users - should refer to this superclass for more information regarding those methods. - - Args: - do_resize (`bool`, *optional*, defaults to `True`): - Whether to resize the input based on certain `size_divisor`. - size_divisor (`int` or `Tuple(int)`, *optional*, defaults to 32): - Make sure the input is divisible by this value. Only has an effect if `do_resize` is set to `True`. - resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`): - An optional resampling filter. This can be one of `PIL.Image.NEAREST`, `PIL.Image.BOX`, - `PIL.Image.BILINEAR`, `PIL.Image.HAMMING`, `PIL.Image.BICUBIC` or `PIL.Image.LANCZOS`. Only has an effect - if `do_resize` is set to `True`. - do_rescale (`bool`, *optional*, defaults to `True`): - Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). 
- """ - - model_input_names = ["pixel_values"] - - def __init__(self, do_resize=True, size_divisor=32, resample=Image.BILINEAR, do_rescale=True, **kwargs): - super().__init__(**kwargs) - self.do_resize = do_resize - self.size_divisor = size_divisor - self.resample = resample - self.do_rescale = do_rescale - - def _resize(self, image, size_divisor, resample): - if not isinstance(image, Image.Image): - image = self.to_pil_image(image) - - width, height = image.size - new_h, new_w = height // size_divisor * size_divisor, width // size_divisor * size_divisor - - image = self.resize(image, size=(new_w, new_h), resample=resample) - - return image - - def __call__( - self, images: ImageInput, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs - ) -> BatchFeature: - """ - Main method to prepare for the model one or several image(s). - - - - NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass - PIL images. - - - - Args: - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a - number of channels, H and W are image height and width. - - return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to `'np'`): - If set, will return tensors of a particular framework. Acceptable values are: - - - `'tf'`: Return TensorFlow `tf.constant` objects. - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - - `'jax'`: Return JAX `jnp.ndarray` objects. - - Returns: - [`BatchFeature`]: A [`BatchFeature`] with the following fields: - - - **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height, - width). - """ - # Input type checking for clearer error - valid_images = False - - # Check that images has a valid type - if isinstance(images, (Image.Image, np.ndarray)) or is_torch_tensor(images): - valid_images = True - elif isinstance(images, (list, tuple)): - if len(images) == 0 or isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]): - valid_images = True - - if not valid_images: - raise ValueError( - "Images must of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single example), " - "`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of examples)." 
- ) - - is_batched = bool( - isinstance(images, (list, tuple)) - and (isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0])) - ) - - if not is_batched: - images = [images] - - # transformations (resizing + rescaling) - if self.do_resize and self.size_divisor is not None: - images = [ - self._resize(image=image, size_divisor=self.size_divisor, resample=self.resample) for image in images - ] - if self.do_rescale: - images = [self.to_numpy_array(image=image) for image in images] - - # return as BatchFeature - data = {"pixel_values": images} - encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) - - return encoded_inputs +# Featrue extractor for GLPN is being replace by image processor +GLPNFeatureExtractor = GLPNImageProcessor From f0c14ee1a279c4dc7508487b323afef001e4ec9e Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 5 Aug 2022 10:17:44 +0100 Subject: [PATCH 044/162] Alias feature extractor mixin --- src/transformers/image_processing_utils.py | 412 +-------------------- 1 file changed, 5 insertions(+), 407 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 7fcf7ee23d12df..62a6178d1f805c 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -13,415 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -import copy -import json -import os -from typing import Any, Dict, Tuple, Union - -import numpy as np - -from requests import HTTPError - -from .dynamic_module_utils import custom_object_save -from .utils import ( - HUGGINGFACE_CO_RESOLVE_ENDPOINT, - IMAGE_PROCESSOR_NAME, - EntryNotFoundError, - PushToHubMixin, - RepositoryNotFoundError, - RevisionNotFoundError, - cached_path, - copy_func, - hf_bucket_url, - is_offline_mode, - is_remote_url, - logging, -) +from .feature_extraction_utils import FeatureExtractionMixin +from .utils import logging logger = logging.get_logger(__name__) -# TODO: Abstract out common logic between ImageProcessorMixin and FeatureExtractorMixin -class ImageProcessorMixin(PushToHubMixin): - """ - Image processor mixin used to provide saving/loading functionality - """ - - _auto_class = None - - def __init__(self, **kwargs): - """Set elements of `kwargs` as attributes.""" - # Pop "processor_class" as it should be saved as private attribute - self._processor_class = kwargs.pop("processor_class", None) - # Additional attributes without default values - for key, value in kwargs.items(): - try: - setattr(self, key, value) - except AttributeError as err: - logger.error(f"Can't set {key} with value {value} for {self}") - raise err - - def _set_processor_class(self, processor_class: str): - """Sets processor class as an attribute.""" - self._processor_class = processor_class - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs): - r""" - Instantiate a type of [`~image_processing_utils.ImageProcessorMixin`] from a image processor, *e.g.* a derived - class of [`BaseImageProcessor`]. - - Args: - pretrained_model_name_or_path (`str` or `os.PathLike`): - This can be either: - - - a string, the *model id* of a pretrained image_processor hosted inside a model repo on - huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or - namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`. 
- - a path to a *directory* containing a image processor file saved using the - [`~image_processing_utils.ImageProcessorMixin.save_pretrained`] method, e.g., - `./my_model_directory/`. - - a path or url to a saved image processor JSON *file*, e.g., - `./my_model_directory/preprocessor_config.json`. - cache_dir (`str` or `os.PathLike`, *optional*): - Path to a directory in which a downloaded pretrained model image processor should be cached if the - standard cache should not be used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force to (re-)download the image processor files and override the cached versions if - they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received file. Attempts to resume the download if such a file - exists. - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. - use_auth_token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `transformers-cli login` (stored in `~/.huggingface`). - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a - git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any - identifier allowed by git. - return_unused_kwargs (`bool`, *optional*, defaults to `False`): - If `False`, then this function returns just the final image processor object. If `True`, then this - functions returns a `Tuple(image_processor, unused_kwargs)` where *unused_kwargs* is a dictionary - consisting of the key/value pairs whose keys are not image processor attributes: i.e., the part of - `kwargs` which has not been used to update `image_processor` and is otherwise ignored. - kwargs (`Dict[str, Any]`, *optional*): - The values in kwargs of any keys which are image processor attributes will be used to override the - loaded values. Behavior concerning key/value pairs whose keys are *not* image processor attributes is - controlled by the `return_unused_kwargs` keyword parameter. - - - - Passing `use_auth_token=True` is required when you want to use a private model. - - - - Returns: - An image processor of type [`~image_processing_utils.ImageProcessorMixin`]. - - Examples: FIXME - - """ - image_processor_dict, kwargs = cls.get_image_processor_dict(pretrained_model_name_or_path, **kwargs) - - return cls.from_dict(image_processor_dict, **kwargs) - - def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs): - """ - Save an image_processor object to the directory `save_directory`, so that it can be re-loaded using the - [`~image_processing_utils.ImageProcessorMixin.from_pretrained`] instance method. - - Args: - save_directory (`str` or `os.PathLike`): - Directory where the image processor JSON file will be saved (will be created if it does not exist). - push_to_hub (`bool`, *optional*, defaults to `False`): - Whether or not to push your image processor to the Hugging Face model hub after saving it. - - - - Using `push_to_hub=True` will synchronize the repository you are pushing to with `save_directory`, - which requires `save_directory` to be a local clone of the repo you are pushing to if it's an existing - folder. 
Pass along `temp_dir=True` to use a temporary directory instead. - - - - kwargs: - Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. - """ - if os.path.isfile(save_directory): - raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") - - if push_to_hub: - commit_message = kwargs.pop("commit_message", None) - repo = self._create_or_get_repo(save_directory, **kwargs) - - # If we have a custom config, we copy the file defining it in the folder and set the attributes so it can be - # loaded from the Hub. - if self._auto_class is not None: - custom_object_save(self, save_directory, config=self) - - os.makedirs(save_directory, exist_ok=True) - # If we save using the predefined names, we can load using `from_pretrained` - output_image_processor_file = os.path.join(save_directory, IMAGE_PROCESSOR_NAME) - - self.to_json_file(output_image_processor_file) - logger.info(f"Image processor saved in {output_image_processor_file}") - - if push_to_hub: - url = self._push_to_hub(repo, commit_message=commit_message) - logger.info(f"Image processor pushed to the hub in this commit: {url}") - - return [output_image_processor_file] - - @classmethod - def get_image_processor_dict( - cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs - ) -> Tuple[Dict[str, Any], Dict[str, Any]]: - """ - From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a - image processor of type [`~image_processing_utils.ImageProcessorMixin`] using `from_dict`. - - Parameters: - pretrained_model_name_or_path (`str` or `os.PathLike`): - The identifier of the pre-trained checkpoint from which we want the dictionary of parameters. - - Returns: - `Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the image processor object. 
- """ - cache_dir = kwargs.pop("cache_dir", None) - force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", False) - proxies = kwargs.pop("proxies", None) - use_auth_token = kwargs.pop("use_auth_token", None) - local_files_only = kwargs.pop("local_files_only", False) - revision = kwargs.pop("revision", None) - - from_pipeline = kwargs.pop("_from_pipeline", None) - from_auto_class = kwargs.pop("_from_auto", False) - - user_agent = {"file_type": "image processor", "from_auto_class": from_auto_class} - if from_pipeline is not None: - user_agent["using_pipeline"] = from_pipeline - - if is_offline_mode() and not local_files_only: - logger.info("Offline mode: forcing local_files_only=True") - local_files_only = True - - pretrained_model_name_or_path = str(pretrained_model_name_or_path) - if os.path.isdir(pretrained_model_name_or_path): - image_processor_file = os.path.join(pretrained_model_name_or_path, IMAGE_PROCESSOR_NAME) - elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): - image_processor_file = pretrained_model_name_or_path - else: - image_processor_file = hf_bucket_url( - pretrained_model_name_or_path, filename=IMAGE_PROCESSOR_NAME, revision=revision, mirror=None - ) - - try: - # Load from URL or cache if already cached - resolved_image_processor_file = cached_path( - image_processor_file, - cache_dir=cache_dir, - force_download=force_download, - proxies=proxies, - resume_download=resume_download, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - user_agent=user_agent, - ) - - except RepositoryNotFoundError: - raise EnvironmentError( - f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier listed on " - "'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a token having " - "permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass " - "`use_auth_token=True`." - ) - except RevisionNotFoundError: - raise EnvironmentError( - f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for this " - f"model name. Check the model page at 'https://huggingface.co/{pretrained_model_name_or_path}' for " - "available revisions." - ) - except EntryNotFoundError: - raise EnvironmentError( - f"{pretrained_model_name_or_path} does not appear to have a file named {IMAGE_PROCESSOR_NAME}." - ) - except HTTPError as err: - raise EnvironmentError( - f"There was a specific connection error when trying to load {pretrained_model_name_or_path}:\n{err}" - ) - except ValueError: - raise EnvironmentError( - f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this model, couldn't find it in" - f" the cached files and it looks like {pretrained_model_name_or_path} is not the path to a directory" - f" containing a {IMAGE_PROCESSOR_NAME} file.\nCheckout your internet connection or see how to run" - " the library in offline mode at" - " 'https://huggingface.co/docs/transformers/installation#offline-mode'." - ) - except EnvironmentError: - raise EnvironmentError( - f"Can't load image processor for '{pretrained_model_name_or_path}'. If you were trying to load it " - "from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. 
" - f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory " - f"containing a {IMAGE_PROCESSOR_NAME} file" - ) - - try: - # Load image_processor dict - with open(resolved_image_processor_file, "r", encoding="utf-8") as reader: - text = reader.read() - image_processor_dict = json.loads(text) - - except json.JSONDecodeError: - raise EnvironmentError( - f"It looks like the config file at '{resolved_image_processor_file}' is not a valid JSON file." - ) - - if resolved_image_processor_file == image_processor_file: - logger.info(f"loading image processor configuration file {image_processor_file}") - else: - logger.info( - f"loading image processor configuration file {image_processor_file} from cache at" - f" {resolved_image_processor_file}" - ) - - return image_processor_dict, kwargs - - @classmethod - def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): - """ - Instantiates a type of [`~image_processing_utils.ImageProcessorMixin`] from a Python dictionary of - parameters. - - Args: - image_processor_dict (`Dict[str, Any]`): - Dictionary that will be used to instantiate the image processor object. Such a dictionary can be - retrieved from a pretrained checkpoint by leveraging the - [`~image_processing_utils.ImageProcessorMixin.to_dict`] method. - kwargs (`Dict[str, Any]`): - Additional parameters from which to initialize the image processor object. - - Returns: - [`~image_processing_utils.ImageProcessorMixin`]: The image processor object instantiated from those - parameters. - """ - return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) - - image_processor = cls(**image_processor_dict) - - # Update image_processor with kwargs if needed - to_remove = [] - for key, value in kwargs.items(): - if hasattr(image_processor, key): - setattr(image_processor, key, value) - to_remove.append(key) - for key in to_remove: - kwargs.pop(key, None) - - logger.info(f"image processor {image_processor}") - if return_unused_kwargs: - return image_processor, kwargs - else: - return image_processor - - def to_dict(self) -> Dict[str, Any]: - """ - Serializes this instance to a Python dictionary. - - Returns: - `Dict[str, Any]`: Dictionary of all the attributes that make up this image processor instance. - """ - output = copy.deepcopy(self.__dict__) - output["image_processor_type"] = self.__class__.__name__ - - return output - - @classmethod - def from_json_file(cls, json_file: Union[str, os.PathLike]): - """ - Instantiates an image processor of type [`~image_processing_utils.ImageProcessorMixin`] from the path to a JSON - file of parameters. - - Args: - json_file (`str` or `os.PathLike`): - Path to the JSON file containing the parameters. - - Returns: - A image processor of type [`~image_processing_utils.ImageProcessorMixin`]: The image_processor object - instantiated from that JSON file. - """ - with open(json_file, "r", encoding="utf-8") as reader: - text = reader.read() - image_processor_dict = json.loads(text) - return cls(**image_processor_dict) - - def to_json_string(self) -> str: - """ - Serializes this instance to a JSON string. - - Returns: - `str`: String containing all the attributes that make up this image_processor instance in JSON format. 
- """ - dictionary = self.to_dict() - - for key, value in dictionary.items(): - if isinstance(value, np.ndarray): - dictionary[key] = value.tolist() - - # make sure private name "_processor_class" is correctly - # saved as "processor_class" - _processor_class = dictionary.pop("_processor_class", None) - if _processor_class is not None: - dictionary["processor_class"] = _processor_class - - return json.dumps(dictionary, indent=2, sort_keys=True) + "\n" - - def to_json_file(self, json_file_path: Union[str, os.PathLike]): - """ - Save this instance to a JSON file. - - Args: - json_file_path (`str` or `os.PathLike`): - Path to the JSON file in which this image_processor instance's parameters will be saved. - """ - with open(json_file_path, "w", encoding="utf-8") as writer: - writer.write(self.to_json_string()) - - def __repr__(self): - return f"{self.__class__.__name__} {self.to_json_string()}" - - @classmethod - def register_for_auto_class(cls, auto_class="AutoImageProcessor"): - """ - Register this class with a given auto class. This should only be used for custom image processors as the ones - in the library are already mapped with `AutoImageProcessor`. - - - - This API is experimental and may have some slight breaking changes in the next releases. - - - - Args: - auto_class (`str` or `type`, *optional*, defaults to `"AutoFeatureExtractor"`): - The auto class to register this new image processor with. - """ - if not isinstance(auto_class, str): - auto_class = auto_class.__name__ - - import transformers.models.auto as auto_module - - if not hasattr(auto_module, auto_class): - raise ValueError(f"{auto_class} is not a valid auto class.") - - cls._auto_class = auto_class - - -ImageProcessorMixin.push_to_hub = copy_func(ImageProcessorMixin.push_to_hub) -ImageProcessorMixin.push_to_hub.__doc__ = ImageProcessorMixin.push_to_hub.__doc__.format( - object="image processor", object_class="AutoImageProcessor", object_files="image processor file" -) +# We use aliasing whilst we phase out the old API. Once feature extractors for vision models +# are deprecated, ImageProcessor mixin will be implemented. Any shared logic will be abstracted out. +ImageProcessorMixin = FeatureExtractionMixin From affb94509117dfc2b1aba786452ff83a3c4638a5 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Sun, 7 Aug 2022 18:58:04 +0100 Subject: [PATCH 045/162] Add return_numpy=False flag for resize --- src/transformers/image_transforms.py | 10 ++++++++-- tests/test_image_transforms.py | 6 ++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 6c0eecc519123d..1561b1c4662b87 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -171,6 +171,7 @@ def resize( size: Tuple[int, int], resample=PIL.Image.Resampling.BILINEAR, data_format: Optional[ChannelDimension] = None, + return_numpy: bool = True, ) -> np.ndarray: """ Resizes `image` to (h, w) specified by `size` using the PIL library. @@ -184,6 +185,9 @@ def resize( The filter to user for resampling. data_format (`ChannelDimension`, *optional*, defaults to `None`): The channel dimension format of the output image. If `None`, will use the inferred format from the input. + return_numpy (`bool`, *optional*, defaults to `True`): + Whether or not to return the resized image as a numpy array. If False a PIL.Image.Image object is + returned. Returns: image: A resized np.ndarray. 
@@ -204,6 +208,8 @@ def resize(
 h, w = size
 # PIL images are in the format (width, height)
 resized_image = image.resize((w, h), resample=resample)
- resized_image = np.array(resized_image)
- resized_image = to_channel_dimension_format(resized_image, data_format)
+
+ if return_numpy:
+ resized_image = np.array(resized_image)
+ resized_image = to_channel_dimension_format(resized_image, data_format)
 return resized_image
diff --git a/tests/test_image_transforms.py b/tests/test_image_transforms.py
index 16bdcae09f6ffd..2c978636a91f46 100644
--- a/tests/test_image_transforms.py
+++ b/tests/test_image_transforms.py
@@ -126,3 +126,9 @@ def test_resize(self):
 resized_image = resize(image, (30, 40), data_format="channels_last")
 self.assertIsInstance(resized_image, np.ndarray)
 self.assertEqual(resized_image.shape, (30, 40, 3))
+
+ # Check PIL.Image.Image is returned if return_numpy=False
+ resized_image = resize(image, (30, 40), return_numpy=False)
+ self.assertIsInstance(resized_image, PIL.Image.Image)
+ # PIL size is in (width, height) order
+ self.assertEqual(resized_image.size, (40, 30))

From 7a4d22a880dd7ae176138f7e829e875e2be6d7b2 Mon Sep 17 00:00:00 2001
From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com>
Date: Mon, 8 Aug 2022 12:31:38 +0100
Subject: [PATCH 046/162] Fix up

---
 src/transformers/image_processing_utils.py | 3 ++-
 src/transformers/image_transforms.py | 7 ++-----
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py
index 3b8d380d53c478..74ed9c31c39716 100644
--- a/src/transformers/image_processing_utils.py
+++ b/src/transformers/image_processing_utils.py
@@ -13,7 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from .feature_extraction_utils import FeatureExtractionMixin, BatchFeature as BaseBatchFeature
+from .feature_extraction_utils import BatchFeature as BaseBatchFeature
+from .feature_extraction_utils import FeatureExtractionMixin
 from .utils import logging

diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py
index 73fa4e5f6e8d44..d09ef526084a75 100644
--- a/src/transformers/image_transforms.py
+++ b/src/transformers/image_transforms.py
@@ -18,7 +18,7 @@
 import numpy as np
 import PIL

-from transformers.utils.import_utils import is_flax_available, is_tf_available, is_torch_available
+from transformers.utils.import_utils import is_tf_available, is_torch_available

 from .image_utils import (
 ChannelDimension,
@@ -35,8 +35,6 @@
 import torch

 if is_tf_available():
 import tensorflow as tf
- if is_flax_available():
- import jax.numpy as jnp


 def to_channel_dimension_format(image: np.ndarray, channel_dim: Union[ChannelDimension, str]) -> np.ndarray:
@@ -188,8 +186,7 @@ def resize(
 data_format (`ChannelDimension`, *optional*, defaults to `None`):
 The channel dimension format of the output image. If `None`, will use the inferred format from the input.
 return_numpy (`bool`, *optional*, defaults to `True`):
- Whether or not to return the resized image as a numpy array. If False a PIL.Image.Image object is
- returned.
+ Whether or not to return the resized image as a numpy array. If False a PIL.Image.Image object is returned.

 Returns:
 image: A resized np.ndarray.
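The resize path above leans on `to_channel_dimension_format` to hand PIL a channels-last array and then restore the requested layout. A short sketch with illustrative shapes; the string forms mirror the `ChannelDimension` enum values used in the tests.

```python
# Hedged sketch of to_channel_dimension_format(); shapes are illustrative.
import numpy as np

from transformers.image_transforms import to_channel_dimension_format

image = np.zeros((3, 64, 48))  # inferred as channels first (3 leading channels)

flipped = to_channel_dimension_format(image, "channels_last")
assert flipped.shape == (64, 48, 3)

# Requesting the format the image is already in leaves the layout untouched.
same = to_channel_dimension_format(flipped, "channels_last")
assert same.shape == (64, 48, 3)
```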
From 994e040254d50848024356a3f051e3cdebe821fd Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Mon, 8 Aug 2022 12:35:05 +0100 Subject: [PATCH 047/162] Fix up --- src/transformers/image_transforms.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 1561b1c4662b87..ffd4f671e2952d 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -186,8 +186,7 @@ def resize( data_format (`ChannelDimension`, *optional*, defaults to `None`): The channel dimension format of the output image. If `None`, will use the inferred format from the input. return_numpy (`bool`, *optional*, defaults to `True`): - Whether or not to return the resized image as a numpy array. If False a PIL.Image.Image object is - returned. + Whether or not to return the resized image as a numpy array. If False a PIL.Image.Image object is returned. Returns: image: A resized np.ndarray. From 42c23bda0b066f1be36b00aee85bb681c8efe09c Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Mon, 8 Aug 2022 12:55:01 +0100 Subject: [PATCH 048/162] Use different frameworks safely --- tests/test_image_transforms.py | 68 +++++++++++++++++++++++++++------- 1 file changed, 54 insertions(+), 14 deletions(-) diff --git a/tests/test_image_transforms.py b/tests/test_image_transforms.py index 2c978636a91f46..6811962dafb8cf 100644 --- a/tests/test_image_transforms.py +++ b/tests/test_image_transforms.py @@ -18,7 +18,7 @@ import numpy as np from parameterized import parameterized -from transformers.testing_utils import require_vision +from transformers.testing_utils import require_flax, require_tf, require_torch, require_vision from transformers.utils.import_utils import is_flax_available, is_tf_available, is_torch_available, is_vision_available @@ -29,7 +29,7 @@ import tensorflow as tf if is_flax_available(): - import jax.numpy as jnp + import jax if is_vision_available(): import PIL.Image @@ -52,21 +52,61 @@ def get_random_image(height, width, num_channels=3, channels_first=True): class ImageTransformsTester(unittest.TestCase): @parameterized.expand( [ - ("numpy_float_channels_first", True, lambda x: x, (3, 4, 5), np.float32), - ("numpy_float_channels_last", True, lambda x: x, (4, 5, 3), np.float32), - ("numpy_int_channels_first", True, lambda x: x, (3, 4, 5), np.int32), - ("numpy_uint_channels_first", True, lambda x: x, (3, 4, 5), np.uint8), - ("tensorflow", is_tf_available, tf.convert_to_tensor, (3, 4, 5), np.float32), - ("torch", is_torch_available, torch.tensor, (3, 4, 5), np.float32), - ("jax", is_flax_available, jnp.array, (3, 4, 5), np.float32), + ("numpy_float_channels_first", (3, 4, 5), np.float32), + ("numpy_float_channels_last", (4, 5, 3), np.float32), + ("numpy_int_channels_first", (3, 4, 5), np.int32), + ("numpy_uint_channels_first", (3, 4, 5), np.uint8), ] ) - def test_to_pil_image(self, name, is_library_available, to_tensor, image_shape, dtype): + @require_vision + def test_to_pil_image(self, name, image_shape, dtype): image = np.random.randint(0, 256, image_shape).astype(dtype) - if is_library_available: - image = to_tensor(image) - pil_image = to_pil_image(image) - self.assertIsInstance(pil_image, PIL.Image.Image) + pil_image = to_pil_image(image) + self.assertIsInstance(pil_image, PIL.Image.Image) + self.assertEqual(pil_image.size, (image_shape[2], image_shape[1])) + + @require_tf + def 
test_to_pil_image_from_tensorflow(self): + # channels_first + image = tf.random.uniform((3, 4, 5)) + pil_image = to_pil_image(image) + self.assertIsInstance(pil_image, PIL.Image.Image) + self.assertEqual(pil_image.size, (5, 4)) + + # channels_last + image = tf.random.uniform((4, 5, 3)) + pil_image = to_pil_image(image) + self.assertIsInstance(pil_image, PIL.Image.Image) + self.assertEqual(pil_image.size, (5, 4)) + + @require_torch + def test_to_pil_image_from_torch(self): + # channels first + image = torch.rand((3, 4, 5)) + pil_image = to_pil_image(image) + self.assertIsInstance(pil_image, PIL.Image.Image) + self.assertEqual(pil_image.size, (5, 4)) + + # channels last + image = torch.rand((4, 5, 3)) + pil_image = to_pil_image(image) + self.assertIsInstance(pil_image, PIL.Image.Image) + self.assertEqual(pil_image.size, (5, 4)) + + @require_flax + def test_to_pil_image_from_jax(self): + key = jax.random.PRNGKey(0) + # channel first + image = jax.random.uniform(key, (3, 4, 5)) + pil_image = to_pil_image(image) + self.assertIsInstance(pil_image, PIL.Image.Image) + self.assertEqual(pil_image.size, (5, 4)) + + # channel last + image = jax.random.uniform(key, (4, 5, 3)) + pil_image = to_pil_image(image) + self.assertIsInstance(pil_image, PIL.Image.Image) + self.assertEqual(pil_image.size, (5, 4)) def test_to_channel_dimension_format(self): # Test that function doesn't reorder if channel dim matches the input. From 05c65f68f16d9391cc8a90de994f79b2ea533894 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Mon, 8 Aug 2022 13:04:52 +0100 Subject: [PATCH 049/162] Safely import PIL --- src/transformers/image_utils.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index 875e74444d3618..3384607247f8e9 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -17,12 +17,10 @@ from typing import List, Tuple, Union import numpy as np -import PIL.Image -import PIL.ImageOps import requests -from .utils import is_flax_available, is_tf_available, is_torch_available +from .utils import is_flax_available, is_tf_available, is_torch_available, is_vision_available from .utils.constants import ( # noqa: F401 IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, @@ -32,6 +30,11 @@ from .utils.generic import ExplicitEnum, _is_jax, _is_tensorflow, _is_torch +if is_vision_available: + import PIL.Image + import PIL.ImageOps + + ImageInput = Union[ PIL.Image.Image, np.ndarray, "torch.Tensor", List[PIL.Image.Image], List[np.ndarray], List["torch.Tensor"] # noqa ] From feb9556e1b2a4ab7ec067c2ff961b9c555689776 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Mon, 8 Aug 2022 13:20:19 +0100 Subject: [PATCH 050/162] Call function checking if PIL available --- src/transformers/image_utils.py | 4 ++-- tests/test_image_transforms.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index 3384607247f8e9..2aa57cb1b6bfea 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -30,13 +30,13 @@ from .utils.generic import ExplicitEnum, _is_jax, _is_tensorflow, _is_torch -if is_vision_available: +if is_vision_available(): import PIL.Image import PIL.ImageOps ImageInput = Union[ - PIL.Image.Image, np.ndarray, "torch.Tensor", List[PIL.Image.Image], List[np.ndarray], List["torch.Tensor"] # noqa + "PIL.Image.Image", 
np.ndarray, "torch.Tensor", List["PIL.Image.Image"], List[np.ndarray], List["torch.Tensor"] # noqa ] diff --git a/tests/test_image_transforms.py b/tests/test_image_transforms.py index 6811962dafb8cf..69e6de1587b8d6 100644 --- a/tests/test_image_transforms.py +++ b/tests/test_image_transforms.py @@ -63,7 +63,7 @@ def test_to_pil_image(self, name, image_shape, dtype): image = np.random.randint(0, 256, image_shape).astype(dtype) pil_image = to_pil_image(image) self.assertIsInstance(pil_image, PIL.Image.Image) - self.assertEqual(pil_image.size, (image_shape[2], image_shape[1])) + self.assertEqual(pil_image.size, (5, 4)) @require_tf def test_to_pil_image_from_tensorflow(self): From a30b007d4646ca36d6f853c0cc4a9437f2e304df Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Mon, 8 Aug 2022 14:29:28 +0100 Subject: [PATCH 051/162] Only import if vision available --- src/transformers/image_transforms.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index ffd4f671e2952d..b485947faf5e2c 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -16,18 +16,20 @@ from typing import TYPE_CHECKING, List, Optional, Tuple, Union import numpy as np -import PIL - -from transformers.utils.import_utils import is_tf_available, is_torch_available - -from .image_utils import ( - ChannelDimension, - get_image_size, - infer_channel_dimension_format, - is_jax_tensor, - is_tf_tensor, - is_torch_tensor, -) + +from transformers.utils.import_utils import is_tf_available, is_torch_available, is_vision_available + +if is_vision_available(): + import PIL + + from .image_utils import ( + ChannelDimension, + get_image_size, + infer_channel_dimension_format, + is_jax_tensor, + is_tf_tensor, + is_torch_tensor, + ) if TYPE_CHECKING: From fd7b6c7bb32f98a73693d73b6792d13537f36bd2 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Tue, 9 Aug 2022 18:15:47 +0100 Subject: [PATCH 052/162] Address Sylvain PR comments Co-authored-by: Sylvain.gugger@gmail.com --- src/transformers/image_transforms.py | 4 ++-- src/transformers/image_utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index b485947faf5e2c..3c825d2f10cc8f 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -206,9 +206,9 @@ def resize( # PIL expects image to have channels last image = to_channel_dimension_format(image, ChannelDimension.LAST) image = to_pil_image(image) - h, w = size + height, width = size # PIL images are in the format (width, height) - resized_image = image.resize((w, h), resample=resample) + resized_image = image.resize((width, height), resample=resample) if return_numpy: resized_image = np.array(resized_image) diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index 2aa57cb1b6bfea..59a7702519534d 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -73,7 +73,7 @@ def infer_channel_dimension_format(image: np.ndarray) -> ChannelDimension: elif image.ndim == 4: first_dim, last_dim = 1, 3 else: - raise ValueError("Unsupported number of image dimensions: {}".format(image.ndim)) + raise ValueError(f"Unsupported number of image dimensions: {image.ndim}") if image.shape[first_dim] in (1, 3): return 
ChannelDimension.FIRST From 790c2c6d997ae263f0813ac0e499edfc140c8cb4 Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Wed, 10 Aug 2022 15:41:47 +0100 Subject: [PATCH 053/162] Apply suggestions from code review Co-authored-by: Sylvain Gugger Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/image_transforms.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 315a6634fa3eb3..558b253addcd08 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -73,13 +73,13 @@ def rescale( Rescales `image` by `scale`. Args: - image (`np.ndarray``): + image (`np.ndarray`): The image to rescale. - scale (`float`, `int`): + scale (`float` or `int`, *optional*, defaults to 255): The scale to use for rescaling the image. data_format (`ChannelDimension`, *optional*): The channel dimension format of the image. If not provided, it will be the same as the input image. - dtype (`np.dtype`, *optional*): + dtype (`np.dtype`, *optional*, defaults to `np.float32`): The dtype of the output image. Defaults to `np.float32`.Used for backwards compatibility with feature extractors From 2e929cfdba42dedf29aaa8c98b1c94e6f3c0a566 Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 12 Aug 2022 11:06:01 +0100 Subject: [PATCH 054/162] Update src/transformers/image_transforms.py Co-authored-by: Alara Dirik <8944735+alaradirik@users.noreply.github.com> --- src/transformers/image_transforms.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 558b253addcd08..6b2ed5522b7ebf 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -80,8 +80,7 @@ def rescale( data_format (`ChannelDimension`, *optional*): The channel dimension format of the image. If not provided, it will be the same as the input image. dtype (`np.dtype`, *optional*, defaults to `np.float32`): - The dtype of the output image. Defaults to `np.float32`.Used for backwards compatibility with feature - extractors + The dtype of the output image. Defaults to `np.float32`. Used for backwards compatibility with feature extractors. Returns: image: A rescaled np.ndarray image. 
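A quick sketch of `rescale` with the defaults documented above; the `1 / 255` factor mirrors the one GLPN's `preprocess` passes in. This is an illustration of the documented behaviour, not the exact implementation.

```python
# Hedged sketch of rescale(): scale multiplies pixel values, output is float32.
import numpy as np

from transformers.image_transforms import rescale

image = np.random.randint(0, 256, (3, 4, 4), dtype=np.uint8)

scaled = rescale(image, scale=1 / 255)
assert scaled.dtype == np.float32
assert 0.0 <= scaled.min() and scaled.max() <= 1.0
```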
From ff04de34932d2e085993f20828316b3c99ba75be Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 12 Aug 2022 16:50:22 +0100 Subject: [PATCH 055/162] Update src/transformers/models/glpn/feature_extraction_glpn.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- src/transformers/models/glpn/feature_extraction_glpn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/glpn/feature_extraction_glpn.py b/src/transformers/models/glpn/feature_extraction_glpn.py index b92385b0eb909e..9bcd4d1b9d6c6d 100644 --- a/src/transformers/models/glpn/feature_extraction_glpn.py +++ b/src/transformers/models/glpn/feature_extraction_glpn.py @@ -20,5 +20,5 @@ logger = logging.get_logger(__name__) -# Featrue extractor for GLPN is being replace by image processor +# Feature extractor for GLPN is being replaced by image processor GLPNFeatureExtractor = GLPNImageProcessor From ae358735685986862d410a269780cc7986aee86d Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Wed, 17 Aug 2022 13:08:48 +0100 Subject: [PATCH 056/162] Add in docstrings --- src/transformers/image_processing_utils.py | 3 +- src/transformers/image_transforms.py | 6 +- .../models/glpn/image_processing_glpn.py | 86 ++++++++++++++++--- 3 files changed, 79 insertions(+), 16 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index c81600511a0a48..721fc86f0ec52f 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -13,7 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .feature_extraction_utils import FeatureExtractionMixin, BatchFeature as BaseBatchFeature +from .feature_extraction_utils import BatchFeature as BaseBatchFeature +from .feature_extraction_utils import FeatureExtractionMixin from .utils import logging diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 6b2ed5522b7ebf..05112d0118f8b4 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -80,7 +80,8 @@ def rescale( data_format (`ChannelDimension`, *optional*): The channel dimension format of the image. If not provided, it will be the same as the input image. dtype (`np.dtype`, *optional*, defaults to `np.float32`): - The dtype of the output image. Defaults to `np.float32`. Used for backwards compatibility with feature extractors. + The dtype of the output image. Defaults to `np.float32`. Used for backwards compatibility with feature + extractors. Returns: image: A rescaled np.ndarray image. @@ -214,8 +215,7 @@ def resize( data_format (`ChannelDimension`, *optional*, defaults to `None`): The channel dimension format of the output image. If `None`, will use the inferred format from the input. return_numpy (`bool`, *optional*, defaults to `True`): - Whether or not to return the resized image as a numpy array. If False a PIL.Image.Image object is - returned. + Whether or not to return the resized image as a numpy array. If False a PIL.Image.Image object is returned. Returns: image: A resized np.ndarray. 
diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py index 1cdb455b50526b..15605781d11a17 100644 --- a/src/transformers/models/glpn/image_processing_glpn.py +++ b/src/transformers/models/glpn/image_processing_glpn.py @@ -14,7 +14,7 @@ # limitations under the License. """Image processor class for GLPN.""" -from typing import Optional, Union +from typing import List, Optional, Union import numpy as np import PIL.Image @@ -36,16 +36,16 @@ class GLPNImageProcessor(BaseImageProcessor): Args: do_resize (`bool`, *optional*, defaults to `True`): - Whether to resize the input based on certain `size_divisor`. - size_divisor (`int` or `Tuple(int)`, *optional*, defaults to 32): - Make sure the input is divisible by this value. Only has an effect if `do_resize` is set to `True`. - resample (`int`, *optional*, defaults to `PIL.Image.Resampling.BILINEAR`): - An optional resampling filter. This can be one of `PIL.Image.Resampling.NEAREST`, - `PIL.Image.Resampling.BOX`, `PIL.Image.Resampling.BILINEAR`, `PIL.Image.Resampling.HAMMING`, - `PIL.Image.Resampling.BICUBIC` or `PIL.Image.Resampling.LANCZOS`. Only has an effect if `do_resize` is set - to `True`. + Set the class default for the `do_resize` parameter. Controls whether to resize the image's (height, width) + dimensions, rounding them down to the closest multiple of `size_divisor`. do_rescale (`bool`, *optional*, defaults to `True`): - Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). + Set the class default for the `do_rescale` parameter. Controls whether or not to apply the scaling factor + (to make pixel values floats between 0. and 1.). + size_divisor (`int`, *optional*, defaults to 32): + Set the class default for the `size_divisor` parameter. When `do_resize` is `True`, images are resized so + their height and width are rounded down to the closest multiple of `size_divisor`. + resample (`PIL.Image.Resampling`, *optional*, defaults to `PIL.Image.Resampling.BILINEAR`): + Set the class default for `resample`. Defines the resampling filter to use if resizing the image. """ model_input_names = ["pixel_values"] @@ -62,12 +62,32 @@ def __init__( def resize( self, image: np.ndarray, - size_divisor: Union[int, float], + size_divisor: int, resample: PIL.Image.Resampling, data_format: Optional[ChannelDimension] = None, **kwargs ) -> np.ndarray: + """ + Resize the image, rounding the (height, width) dimensions down to the closest multiple of size_divisor. + + If the image is of dimension (3, 260, 170) and size_divisor is 32, the image will be resized to (3, 256, 160). + + Args: + image (`np.ndarray`): + The image to resize. + size_divisor (`int`): + The image is resized so its height and width are rounded down to the closest multiple of + `size_divisor`. + resample (`PIL.Image.Resampling`): + Resampling filter to use when resizing the image. + data_format (`ChannelDimension`, *optional*): + The channel dimension format for the output image. If `None`, the channel dimension format of the input + image is used. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. 
+ """ height, width = get_image_size(image) + # Rounds the height and width down to the closest multiple of size_divisor new_h = height // size_divisor * size_divisor new_w = width // size_divisor * size_divisor image = resize(image, (new_h, new_w), resample=resample, data_format=data_format, **kwargs) @@ -76,11 +96,25 @@ def resize( def rescale( self, image: np.ndarray, scale: Union[int, float], data_format: Optional[ChannelDimension] = None, **kwargs ) -> np.ndarray: + """ + Rescale the image by the given scaling factor `scale`. + + Args: + image (`np.ndarray`): + The image to rescale. + scale (`int` or `float`): + The scaling factor to rescale pixel values by. + data_format (`ChannelDimension`, *optional*): + The channel dimension format for the output image. If `None`, the channel dimension format of the input + image is used. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + """ return rescale(image=image, scale=scale, data_format=data_format, **kwargs) def preprocess( self, - images, + images: Union["PIL.Image.Image", TensorType, List["PIL.Image.Image"], List[TensorType]], do_resize: bool = None, do_rescale: bool = None, size_divisor: int = None, @@ -89,6 +123,34 @@ def preprocess( data_format: ChannelDimension = ChannelDimension.FIRST, **kwargs ) -> BatchFeature: + """ + Preprocess the given images. + + Args: + images (`PIL.Image.Image` or `TensorType` or `List[np.ndarray]` or `List[TensorType]`): + The image or images to preprocess. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the input such that the (height, width) dimensions are a multiple of `size_divisor`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). + size_divisor (`int`, *optional*, defaults to `self.size_divisor`): + When `do_resize` is `True`, images are resized so their height and width are rounded down to the + closest multiple of `size_divisor`. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PIL.Image.Resampling`, + Only has an effect if `do_resize` is set to `True`. + return_tensors (`str`, *optional*, defaults to `None`): + The type of tensors to return. Can be one of: + - `None`: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. 
+ """ do_resize = do_resize if do_resize is not None else self.do_resize do_rescale = do_rescale if do_rescale is not None else self.do_rescale size_divisor = size_divisor if size_divisor is not None else self.size_divisor From b7edea0367874b73a6775b217d89c6c4b21c674e Mon Sep 17 00:00:00 2001 From: Seunghwan Hong Date: Fri, 5 Aug 2022 20:39:40 +0900 Subject: [PATCH 057/162] Fix TFSwinSelfAttention to have relative position index as non-trainable weight (#18226) Signed-off-by: Seunghwan Hong --- .../models/swin/modeling_tf_swin.py | 37 +++++++++++-------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/swin/modeling_tf_swin.py b/src/transformers/models/swin/modeling_tf_swin.py index c781710bdd0ff5..dc0e7131628b89 100644 --- a/src/transformers/models/swin/modeling_tf_swin.py +++ b/src/transformers/models/swin/modeling_tf_swin.py @@ -461,21 +461,6 @@ def __init__(self, config: SwinConfig, dim: int, num_heads: int, **kwargs) -> No window_size if isinstance(window_size, collections.abc.Iterable) else (window_size, window_size) ) - # get pair-wise relative position index for each token inside the window - coords_h = tf.range(self.window_size[0]) - coords_w = tf.range(self.window_size[1]) - coords = tf.stack(tf.meshgrid(coords_h, coords_w, indexing="ij")) - coords_flatten = tf.reshape(coords, (shape_list(coords)[0], -1)) - relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] - relative_coords = tf.transpose(relative_coords, (1, 2, 0)) - - stack_0, stack_1 = tf.unstack(relative_coords, axis=2) - stack_0 += self.window_size[0] - 1 - stack_0 *= 2 * self.window_size[1] - 1 - stack_1 += self.window_size[1] - 1 - relative_coords = tf.stack([stack_0, stack_1], axis=2) - self.relative_position_index = tf.reduce_sum(relative_coords, axis=-1) - self.query = tf.keras.layers.Dense( self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), @@ -503,6 +488,28 @@ def build(self, input_shape: tf.TensorShape) -> None: initializer="zeros", name="relative_position_bias_table", ) + self.relative_position_index = self.add_weight( + shape=(self.window_size[0] ** 2, self.window_size[1] ** 2), + trainable=False, + dtype=tf.int32, + name="relative_position_index", + ) + + # get pair-wise relative position index for each token inside the window + coords_h = tf.range(self.window_size[0]) + coords_w = tf.range(self.window_size[1]) + coords = tf.stack(tf.meshgrid(coords_h, coords_w, indexing="ij")) + coords_flatten = tf.reshape(coords, (shape_list(coords)[0], -1)) + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] + relative_coords = tf.transpose(relative_coords, (1, 2, 0)) + + stack_0, stack_1 = tf.unstack(relative_coords, axis=2) + stack_0 += self.window_size[0] - 1 + stack_0 *= 2 * self.window_size[1] - 1 + stack_1 += self.window_size[1] - 1 + relative_coords = tf.stack([stack_0, stack_1], axis=2) + + self.relative_position_index.assign(tf.cast(tf.reduce_sum(relative_coords, axis=-1), tf.int32)) super().build(input_shape) def transpose_for_scores(self, x: tf.Tensor) -> tf.Tensor: From 8cacf30888ae059bead694a21012c2e87ef62405 Mon Sep 17 00:00:00 2001 From: Seunghwan Hong Date: Fri, 5 Aug 2022 20:40:14 +0900 Subject: [PATCH 058/162] Refactor `TFSwinLayer` to increase serving compatibility (#18352) * Refactor `TFSwinLayer` to increase serving compatibility Signed-off-by: Seunghwan Hong * Fix missed parameters while refactoring Signed-off-by: Seunghwan Hong * Fix window_reverse to calculate batch size Signed-off-by: 
Seunghwan Hong Co-Authored-By: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- .../models/swin/modeling_tf_swin.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/swin/modeling_tf_swin.py b/src/transformers/models/swin/modeling_tf_swin.py index dc0e7131628b89..2f9bd27b0e0006 100644 --- a/src/transformers/models/swin/modeling_tf_swin.py +++ b/src/transformers/models/swin/modeling_tf_swin.py @@ -226,9 +226,9 @@ def window_reverse(windows: tf.Tensor, window_size: int, height: int, width: int """ Merges windows to produce higher resolution features. """ - x = shape_list(windows)[0] + x = tf.shape(windows)[0] y = tf.cast(height * width / (window_size * window_size), tf.int32) - batch_size = int(x / y) + batch_size = tf.math.floordiv(x, y) windows = tf.reshape( windows, (batch_size, height // window_size, width // window_size, window_size, window_size, -1) ) @@ -695,16 +695,18 @@ def get_attn_mask(self, height: int, width: int, window_size: int, shift_size: i img_mask = tf.expand_dims(img_mask, -1) img_mask = tf.expand_dims(img_mask, 0) - mask_windows = window_partition(img_mask, self.window_size) - mask_windows = tf.reshape(mask_windows, (-1, self.window_size * self.window_size)) + mask_windows = window_partition(img_mask, window_size) + mask_windows = tf.reshape(mask_windows, (-1, window_size * window_size)) attn_mask = tf.expand_dims(mask_windows, 1) - tf.expand_dims(mask_windows, 2) attn_mask = tf.where(attn_mask != 0, float(-100.0), attn_mask) attn_mask = tf.where(attn_mask == 0, float(0.0), attn_mask) return attn_mask - def maybe_pad(self, hidden_states: tf.Tensor, height: int, width: int) -> Tuple[tf.Tensor, tf.Tensor]: - pad_right = (self.window_size - width % self.window_size) % self.window_size - pad_bottom = (self.window_size - height % self.window_size) % self.window_size + def maybe_pad( + self, hidden_states: tf.Tensor, window_size: int, height: int, width: int + ) -> Tuple[tf.Tensor, tf.Tensor]: + pad_right = (window_size - width % window_size) % window_size + pad_bottom = (window_size - height % window_size) % window_size pad_values = [[0, 0], [0, pad_bottom], [0, pad_right], [0, 0]] hidden_states = tf.pad(hidden_states, pad_values) pad_values = tf.reshape(pad_values, (-1,)) @@ -730,7 +732,7 @@ def call( hidden_states = self.layernorm_before(hidden_states, training=training) hidden_states = tf.reshape(hidden_states, (batch_size, height, width, channels)) # pad hidden_states to multiples of window size - hidden_states, pad_values = self.maybe_pad(hidden_states, height, width) + hidden_states, pad_values = self.maybe_pad(hidden_states, window_size, height, width) _, height_pad, width_pad, _ = shape_list(hidden_states) # cyclic shift From e385c5a9e8c19b7c58bb29ab5127c1673624dda8 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Fri, 5 Aug 2022 13:59:55 +0200 Subject: [PATCH 059/162] Add TF prefix to TF-Res test class (#18481) Co-authored-by: ydshieh --- tests/models/resnet/test_modeling_tf_resnet.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/models/resnet/test_modeling_tf_resnet.py b/tests/models/resnet/test_modeling_tf_resnet.py index 5f4eead8661cbf..1056ebc8eeac4f 100644 --- a/tests/models/resnet/test_modeling_tf_resnet.py +++ b/tests/models/resnet/test_modeling_tf_resnet.py @@ -41,7 +41,7 @@ from transformers import AutoFeatureExtractor -class 
ResNetModelTester: +class TFResNetModelTester: def __init__( self, parent, @@ -116,7 +116,7 @@ def prepare_config_and_inputs_for_common(self): @require_tf -class ResNetModelTest(TFModelTesterMixin, unittest.TestCase): +class TFResNetModelTest(TFModelTesterMixin, unittest.TestCase): """ Here we also overwrite some of the tests of test_modeling_common.py, as ResNet does not use input_ids, inputs_embeds, attention_mask and seq_length. @@ -131,7 +131,7 @@ class ResNetModelTest(TFModelTesterMixin, unittest.TestCase): has_attentions = False def setUp(self): - self.model_tester = ResNetModelTester(self) + self.model_tester = TFResNetModelTester(self) self.config_tester = ConfigTester(self, config_class=ResNetConfig, has_text_modality=False) def test_config(self): @@ -223,7 +223,7 @@ def prepare_img(): @require_tf @require_vision -class ResNetModelIntegrationTest(unittest.TestCase): +class TFResNetModelIntegrationTest(unittest.TestCase): @cached_property def default_feature_extractor(self): return ( From ed4b0595ca4a27c3c8b3f39e4ae2d9b0695f8ea9 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Fri, 5 Aug 2022 09:12:19 -0400 Subject: [PATCH 060/162] Remove py.typed (#18485) --- setup.py | 1 - src/transformers/py.typed | 1 - 2 files changed, 2 deletions(-) delete mode 100644 src/transformers/py.typed diff --git a/setup.py b/setup.py index 9c37822ac98fdb..52b7837a88456d 100644 --- a/setup.py +++ b/setup.py @@ -412,7 +412,6 @@ def run(self): url="https://github.com/huggingface/transformers", package_dir={"": "src"}, packages=find_packages("src"), - package_data={"transformers": ["py.typed"]}, zip_safe=False, extras_require=extras, entry_points={"console_scripts": ["transformers-cli=transformers.commands.transformers_cli:main"]}, diff --git a/src/transformers/py.typed b/src/transformers/py.typed deleted file mode 100644 index 8b137891791fe9..00000000000000 --- a/src/transformers/py.typed +++ /dev/null @@ -1 +0,0 @@ - From 553be89075b86c5e61ab0612e578981b0026d1f2 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Fri, 5 Aug 2022 09:14:51 -0400 Subject: [PATCH 061/162] Fix pipeline tests (#18487) * Fix pipeline tests * Make sure all pipelines tests run with init changes --- src/transformers/pipelines/__init__.py | 1 + tests/pipelines/test_pipelines_common.py | 4 ++-- utils/tests_fetcher.py | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py index 7a022e5635e674..d2a4b663801d78 100755 --- a/src/transformers/pipelines/__init__.py +++ b/src/transformers/pipelines/__init__.py @@ -606,6 +606,7 @@ def pipeline( # Retrieve the task if task in custom_tasks: + normalized_task = task targeted_task, task_options = clean_custom_task(custom_tasks[task]) if pipeline_class is None: if not trust_remote_code: diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index 83474a5ba048e2..5d5c8fa2333eb6 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -795,7 +795,7 @@ def test_warning_logs(self): alias = "text-classification" # Get the original task, so we can restore it at the end. 
# (otherwise the subsequential tests in `TextClassificationPipelineTests` will fail) - original_task, original_task_options = PIPELINE_REGISTRY.check_task(alias) + _, original_task, _ = PIPELINE_REGISTRY.check_task(alias) try: with CaptureLogger(logger_) as cm: @@ -816,7 +816,7 @@ def test_register_pipeline(self): ) assert "custom-text-classification" in PIPELINE_REGISTRY.get_supported_tasks() - task_def, _ = PIPELINE_REGISTRY.check_task("custom-text-classification") + _, task_def, _ = PIPELINE_REGISTRY.check_task("custom-text-classification") self.assertEqual(task_def["pt"], (AutoModelForSequenceClassification,) if is_torch_available() else ()) self.assertEqual(task_def["tf"], (TFAutoModelForSequenceClassification,) if is_tf_available() else ()) self.assertEqual(task_def["type"], "text") diff --git a/utils/tests_fetcher.py b/utils/tests_fetcher.py index 1fe515aec99521..e8c171f2187901 100644 --- a/utils/tests_fetcher.py +++ b/utils/tests_fetcher.py @@ -378,6 +378,7 @@ def create_reverse_dependency_map(): ], "optimization.py": "optimization/test_optimization.py", "optimization_tf.py": "optimization/test_optimization_tf.py", + "pipelines/__init__.py": "pipelines/test_pipelines_*.py", "pipelines/base.py": "pipelines/test_pipelines_*.py", "pipelines/text2text_generation.py": [ "pipelines/test_pipelines_text2text_generation.py", From cfa16ebed6d266b63bb609f6504b4237ba51cc2a Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Fri, 5 Aug 2022 10:12:40 -0400 Subject: [PATCH 062/162] Use new huggingface_hub tools for download models (#18438) * Draft new cached_file * Initial draft for config and model * Small fixes * Fix first batch of tests * Look in cache when internet is down * Fix last tests * Bad black, not fixing all quality errors * Make diff less * Implement change for TF and Flax models * Add tokenizer and feature extractor * For compatibility with main * Add utils to move the cache and auto-do it at first use. * Quality * Deal with empty commit shas * Deal with empty etag * Address review comments --- src/transformers/configuration_utils.py | 118 ++--- src/transformers/feature_extraction_utils.py | 103 ++--- src/transformers/modeling_flax_utils.py | 139 +++--- src/transformers/modeling_tf_utils.py | 139 +++--- src/transformers/modeling_utils.py | 136 ++---- src/transformers/tokenization_utils_base.py | 94 +--- src/transformers/utils/__init__.py | 2 + src/transformers/utils/hub.py | 456 +++++++++++++++++-- tests/test_configuration_common.py | 4 +- tests/test_feature_extraction_common.py | 4 +- tests/test_modeling_common.py | 4 +- tests/test_modeling_tf_common.py | 4 +- tests/test_tokenization_common.py | 4 +- 13 files changed, 662 insertions(+), 545 deletions(-) diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index fe2d6b3aaef637..b10475127b4fce 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -25,25 +25,9 @@ from packaging import version -from requests import HTTPError - from . 
import __version__
from .dynamic_module_utils import custom_object_save
-from .utils import (
-    CONFIG_NAME,
-    HUGGINGFACE_CO_RESOLVE_ENDPOINT,
-    EntryNotFoundError,
-    PushToHubMixin,
-    RepositoryNotFoundError,
-    RevisionNotFoundError,
-    cached_path,
-    copy_func,
-    hf_bucket_url,
-    is_offline_mode,
-    is_remote_url,
-    is_torch_available,
-    logging,
-)
+from .utils import CONFIG_NAME, PushToHubMixin, cached_file, copy_func, is_torch_available, logging
 logger = logging.get_logger(__name__)
@@ -591,77 +575,43 @@ def _get_config_dict(
        if from_pipeline is not None:
            user_agent["using_pipeline"] = from_pipeline
-        if is_offline_mode() and not local_files_only:
-            logger.info("Offline mode: forcing local_files_only=True")
-            local_files_only = True
-
        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
-        if os.path.isfile(os.path.join(subfolder, pretrained_model_name_or_path)) or is_remote_url(
-            pretrained_model_name_or_path
-        ):
-            config_file = pretrained_model_name_or_path
+
+        is_local = os.path.isdir(pretrained_model_name_or_path)
+        if os.path.isfile(os.path.join(subfolder, pretrained_model_name_or_path)):
+            # Special case when pretrained_model_name_or_path is a local file
+            resolved_config_file = pretrained_model_name_or_path
+            is_local = True
        else:
            configuration_file = kwargs.pop("_configuration_file", CONFIG_NAME)
-            if os.path.isdir(os.path.join(pretrained_model_name_or_path, subfolder)):
-                config_file = os.path.join(pretrained_model_name_or_path, subfolder, configuration_file)
-            else:
-                config_file = hf_bucket_url(
+            try:
+                # Load from local folder or from cache or download from model Hub and cache
+                resolved_config_file = cached_file(
                    pretrained_model_name_or_path,
-                    filename=configuration_file,
+                    configuration_file,
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    proxies=proxies,
+                    resume_download=resume_download,
+                    local_files_only=local_files_only,
+                    use_auth_token=use_auth_token,
+                    user_agent=user_agent,
                    revision=revision,
-                    subfolder=subfolder if len(subfolder) > 0 else None,
-                    mirror=None,
+                    subfolder=subfolder,
+                )
+            except EnvironmentError:
+                # Raise any environment error raised by `cached_file`. It will have a helpful error message adapted to
+                # the original exception.
+                raise
+            except Exception:
+                # For any other exception, we throw a generic error.
+                raise EnvironmentError(
+                    f"Can't load the configuration of '{pretrained_model_name_or_path}'. If you were trying to load it"
+                    " from 'https://huggingface.co/models', make sure you don't have a local directory with the same"
+                    f" name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory"
+                    f" containing a {configuration_file} file"
                )
-
-        try:
-            # Load from URL or cache if already cached
-            resolved_config_file = cached_path(
-                config_file,
-                cache_dir=cache_dir,
-                force_download=force_download,
-                proxies=proxies,
-                resume_download=resume_download,
-                local_files_only=local_files_only,
-                use_auth_token=use_auth_token,
-                user_agent=user_agent,
-            )
-
-        except RepositoryNotFoundError:
-            raise EnvironmentError(
-                f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier listed on "
-                "'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a token having "
-                "permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass "
-                "`use_auth_token=True`."
- ) - except RevisionNotFoundError: - raise EnvironmentError( - f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for this " - f"model name. Check the model page at 'https://huggingface.co/{pretrained_model_name_or_path}' for " - "available revisions." - ) - except EntryNotFoundError: - raise EnvironmentError( - f"{pretrained_model_name_or_path} does not appear to have a file named {configuration_file}." - ) - except HTTPError as err: - raise EnvironmentError( - f"There was a specific connection error when trying to load {pretrained_model_name_or_path}:\n{err}" - ) - except ValueError: - raise EnvironmentError( - f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this model, couldn't find it in" - f" the cached files and it looks like {pretrained_model_name_or_path} is not the path to a directory" - f" containing a {configuration_file} file.\nCheckout your internet connection or see how to run the" - " library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'." - ) - except EnvironmentError: - raise EnvironmentError( - f"Can't load config for '{pretrained_model_name_or_path}'. If you were trying to load it from " - "'https://huggingface.co/models', make sure you don't have a local directory with the same name. " - f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory " - f"containing a {configuration_file} file" - ) try: # Load config dict @@ -671,10 +621,10 @@ def _get_config_dict( f"It looks like the config file at '{resolved_config_file}' is not a valid JSON file." ) - if resolved_config_file == config_file: - logger.info(f"loading configuration file {config_file}") + if is_local: + logger.info(f"loading configuration file {resolved_config_file}") else: - logger.info(f"loading configuration file {config_file} from cache at {resolved_config_file}") + logger.info(f"loading configuration file {configuration_file} from cache at {resolved_config_file}") return config_dict, kwargs diff --git a/src/transformers/feature_extraction_utils.py b/src/transformers/feature_extraction_utils.py index b411d744284665..ec68f355191c1d 100644 --- a/src/transformers/feature_extraction_utils.py +++ b/src/transformers/feature_extraction_utils.py @@ -24,23 +24,15 @@ import numpy as np -from requests import HTTPError - from .dynamic_module_utils import custom_object_save from .utils import ( FEATURE_EXTRACTOR_NAME, - HUGGINGFACE_CO_RESOLVE_ENDPOINT, - EntryNotFoundError, PushToHubMixin, - RepositoryNotFoundError, - RevisionNotFoundError, TensorType, - cached_path, + cached_file, copy_func, - hf_bucket_url, is_flax_available, is_offline_mode, - is_remote_url, is_tf_available, is_torch_available, logging, @@ -388,64 +380,40 @@ def get_feature_extractor_dict( local_files_only = True pretrained_model_name_or_path = str(pretrained_model_name_or_path) + is_local = os.path.isdir(pretrained_model_name_or_path) if os.path.isdir(pretrained_model_name_or_path): feature_extractor_file = os.path.join(pretrained_model_name_or_path, FEATURE_EXTRACTOR_NAME) - elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): - feature_extractor_file = pretrained_model_name_or_path + if os.path.isfile(pretrained_model_name_or_path): + resolved_feature_extractor_file = pretrained_model_name_or_path + is_local = True else: - feature_extractor_file = hf_bucket_url( - pretrained_model_name_or_path, filename=FEATURE_EXTRACTOR_NAME, revision=revision, mirror=None - ) - 
-        try:
-            # Load from URL or cache if already cached
-            resolved_feature_extractor_file = cached_path(
-                feature_extractor_file,
-                cache_dir=cache_dir,
-                force_download=force_download,
-                proxies=proxies,
-                resume_download=resume_download,
-                local_files_only=local_files_only,
-                use_auth_token=use_auth_token,
-                user_agent=user_agent,
-            )
-
-        except RepositoryNotFoundError:
-            raise EnvironmentError(
-                f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier listed on "
-                "'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a token having "
-                "permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass "
-                "`use_auth_token=True`."
-            )
-        except RevisionNotFoundError:
-            raise EnvironmentError(
-                f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for this "
-                f"model name. Check the model page at 'https://huggingface.co/{pretrained_model_name_or_path}' for "
-                "available revisions."
-            )
-        except EntryNotFoundError:
-            raise EnvironmentError(
-                f"{pretrained_model_name_or_path} does not appear to have a file named {FEATURE_EXTRACTOR_NAME}."
-            )
-        except HTTPError as err:
-            raise EnvironmentError(
-                f"There was a specific connection error when trying to load {pretrained_model_name_or_path}:\n{err}"
-            )
-        except ValueError:
-            raise EnvironmentError(
-                f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this model, couldn't find it in"
-                f" the cached files and it looks like {pretrained_model_name_or_path} is not the path to a directory"
-                f" containing a {FEATURE_EXTRACTOR_NAME} file.\nCheckout your internet connection or see how to run"
-                " the library in offline mode at"
-                " 'https://huggingface.co/docs/transformers/installation#offline-mode'."
-            )
-        except EnvironmentError:
-            raise EnvironmentError(
-                f"Can't load feature extractor for '{pretrained_model_name_or_path}'. If you were trying to load it "
-                "from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. "
-                f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory "
-                f"containing a {FEATURE_EXTRACTOR_NAME} file"
-            )
+        feature_extractor_file = FEATURE_EXTRACTOR_NAME
+        try:
+            # Load from local folder or from cache or download from model Hub and cache
+            resolved_feature_extractor_file = cached_file(
+                pretrained_model_name_or_path,
+                feature_extractor_file,
+                cache_dir=cache_dir,
+                force_download=force_download,
+                proxies=proxies,
+                resume_download=resume_download,
+                local_files_only=local_files_only,
+                use_auth_token=use_auth_token,
+                user_agent=user_agent,
+                revision=revision,
+            )
+        except EnvironmentError:
+            # Raise any environment error raised by `cached_file`. It will have a helpful error message adapted to
+            # the original exception.
+            raise
+        except Exception:
+            # For any other exception, we throw a generic error.
+            raise EnvironmentError(
+                f"Can't load feature extractor for '{pretrained_model_name_or_path}'. If you were trying to load"
+                " it from 'https://huggingface.co/models', make sure you don't have a local directory with the"
+                f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
+                f" directory containing a {FEATURE_EXTRACTOR_NAME} file"
+            )
        try:
            # Load feature_extractor dict
@@ -458,12 +426,11 @@ def get_feature_extractor_dict(
                f"It looks like the config file at '{resolved_feature_extractor_file}' is not a valid JSON file."
) - if resolved_feature_extractor_file == feature_extractor_file: - logger.info(f"loading feature extractor configuration file {feature_extractor_file}") + if is_local: + logger.info(f"loading configuration file {resolved_feature_extractor_file}") else: logger.info( - f"loading feature extractor configuration file {feature_extractor_file} from cache at" - f" {resolved_feature_extractor_file}" + f"loading configuration file {feature_extractor_file} from cache at {resolved_feature_extractor_file}" ) return feature_extractor_dict, kwargs diff --git a/src/transformers/modeling_flax_utils.py b/src/transformers/modeling_flax_utils.py index 0dcb3bc959e83d..af75b418cad23e 100644 --- a/src/transformers/modeling_flax_utils.py +++ b/src/transformers/modeling_flax_utils.py @@ -32,7 +32,6 @@ from flax.serialization import from_bytes, to_bytes from flax.traverse_util import flatten_dict, unflatten_dict from jax.random import PRNGKey -from requests import HTTPError from .configuration_utils import PretrainedConfig from .dynamic_module_utils import custom_object_save @@ -41,20 +40,14 @@ from .utils import ( FLAX_WEIGHTS_INDEX_NAME, FLAX_WEIGHTS_NAME, - HUGGINGFACE_CO_RESOLVE_ENDPOINT, WEIGHTS_NAME, - EntryNotFoundError, PushToHubMixin, - RepositoryNotFoundError, - RevisionNotFoundError, add_code_sample_docstrings, add_start_docstrings_to_model_forward, - cached_path, + cached_file, copy_func, has_file, - hf_bucket_url, is_offline_mode, - is_remote_url, logging, replace_return_docstrings, ) @@ -557,6 +550,9 @@ def from_pretrained( The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any identifier allowed by git. + subfolder (`str`, *optional*, defaults to `""`): + In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can + specify the folder name here. kwargs (remaining dictionary of keyword arguments, *optional*): Can be used to update the configuration object (after it being loaded) and initiate the model (e.g., `output_attentions=True`). Behaves differently depending on whether a `config` is provided or @@ -598,6 +594,7 @@ def from_pretrained( from_pipeline = kwargs.pop("_from_pipeline", None) from_auto_class = kwargs.pop("_from_auto", False) _do_init = kwargs.pop("_do_init", True) + subfolder = kwargs.pop("subfolder", "") if trust_remote_code is True: logger.warning( @@ -642,6 +639,8 @@ def from_pretrained( # Load model if pretrained_model_name_or_path is not None: + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + is_local = os.path.isdir(pretrained_model_name_or_path) if os.path.isdir(pretrained_model_name_or_path): if from_pt and os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)): # Load from a PyTorch checkpoint @@ -665,65 +664,44 @@ def from_pretrained( f"Error no file named {FLAX_WEIGHTS_NAME} or {WEIGHTS_NAME} found in directory " f"{pretrained_model_name_or_path}." 
                )
-        elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
+        elif os.path.isfile(pretrained_model_name_or_path):
            archive_file = pretrained_model_name_or_path
+            is_local = True
        else:
            filename = WEIGHTS_NAME if from_pt else FLAX_WEIGHTS_NAME
-            archive_file = hf_bucket_url(
-                pretrained_model_name_or_path,
-                filename=filename,
-                revision=revision,
-            )
-
-        # redirect to the cache, if necessary
+            try:
+                # Load from URL or cache if already cached
+                cached_file_kwargs = dict(
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    proxies=proxies,
+                    resume_download=resume_download,
+                    local_files_only=local_files_only,
+                    use_auth_token=use_auth_token,
+                    user_agent=user_agent,
+                    revision=revision,
+                    subfolder=subfolder,
+                    _raise_exceptions_for_missing_entries=False,
+                )
+                resolved_archive_file = cached_file(pretrained_model_name_or_path, filename, **cached_file_kwargs)
-        try:
-            resolved_archive_file = cached_path(
-                archive_file,
-                cache_dir=cache_dir,
-                force_download=force_download,
-                proxies=proxies,
-                resume_download=resume_download,
-                local_files_only=local_files_only,
-                use_auth_token=use_auth_token,
-                user_agent=user_agent,
-            )
-
-        except RepositoryNotFoundError:
-            raise EnvironmentError(
-                f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier "
-                "listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a "
-                "token having permission to this repo with `use_auth_token` or log in with `huggingface-cli "
-                "login` and pass `use_auth_token=True`."
-            )
-        except RevisionNotFoundError:
-            raise EnvironmentError(
-                f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for "
-                "this model name. Check the model page at "
-                f"'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions."
-            )
-        except EntryNotFoundError:
-            if filename == FLAX_WEIGHTS_NAME:
-                try:
+                # Since we set _raise_exceptions_for_missing_entries=False, we don't get an exception but a None
+                # result when internet is up, the repo and revision exist, but the file does not.
+                if resolved_archive_file is None and filename == FLAX_WEIGHTS_NAME:
                    # Maybe the checkpoint is sharded, we try to grab the index name in this case.
-                    archive_file = hf_bucket_url(
-                        pretrained_model_name_or_path,
-                        filename=FLAX_WEIGHTS_INDEX_NAME,
-                        revision=revision,
-                    )
-                    resolved_archive_file = cached_path(
-                        archive_file,
-                        cache_dir=cache_dir,
-                        force_download=force_download,
-                        proxies=proxies,
-                        resume_download=resume_download,
-                        local_files_only=local_files_only,
-                        use_auth_token=use_auth_token,
-                        user_agent=user_agent,
+                    resolved_archive_file = cached_file(
+                        pretrained_model_name_or_path, FLAX_WEIGHTS_INDEX_NAME, **cached_file_kwargs
                    )
-                    is_sharded = True
-                except EntryNotFoundError:
-                    has_file_kwargs = {"revision": revision, "proxies": proxies, "use_auth_token": use_auth_token}
+                    if resolved_archive_file is not None:
+                        is_sharded = True
+                if resolved_archive_file is None:
+                    # Otherwise, maybe there is a TF or Flax model file. We try those to give a helpful error
+                    # message.
+                    has_file_kwargs = {
+                        "revision": revision,
+                        "proxies": proxies,
+                        "use_auth_token": use_auth_token,
+                    }
                    if has_file(pretrained_model_name_or_path, WEIGHTS_NAME, **has_file_kwargs):
                        raise EnvironmentError(
                            f"{pretrained_model_name_or_path} does not appear to have a file named"
@@ -735,35 +713,24 @@ def from_pretrained(
                            f"{pretrained_model_name_or_path} does not appear to have a file named"
                            f" {FLAX_WEIGHTS_NAME} or {WEIGHTS_NAME}."
                        )
-            else:
+            except EnvironmentError:
+                # Raise any environment error raised by `cached_file`. It will have a helpful error message adapted
+                # to the original exception.
+                raise
+            except Exception:
+                # For any other exception, we throw a generic error.
                raise EnvironmentError(
-                    f"{pretrained_model_name_or_path} does not appear to have a file named {filename}."
+                    f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it"
+                    " from 'https://huggingface.co/models', make sure you don't have a local directory with the"
+                    f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
+                    f" directory containing a file named {FLAX_WEIGHTS_NAME} or {WEIGHTS_NAME}."
                )
-        except HTTPError as err:
-            raise EnvironmentError(
-                f"There was a specific connection error when trying to load {pretrained_model_name_or_path}:\n"
-                f"{err}"
-            )
-        except ValueError:
-            raise EnvironmentError(
-                f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this model, couldn't find it"
-                f" in the cached files and it looks like {pretrained_model_name_or_path} is not the path to a"
-                f" directory containing a file named {FLAX_WEIGHTS_NAME} or {WEIGHTS_NAME}.\nCheckout your"
-                " internet connection or see how to run the library in offline mode at"
-                " 'https://huggingface.co/docs/transformers/installation#offline-mode'."
-            )
-        except EnvironmentError:
-            raise EnvironmentError(
-                f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it from "
-                "'https://huggingface.co/models', make sure you don't have a local directory with the same name. "
-                f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory "
-                f"containing a file named {FLAX_WEIGHTS_NAME} or {WEIGHTS_NAME}."
-            )
-
-        if resolved_archive_file == archive_file:
+
+        if is_local:
            logger.info(f"loading weights file {archive_file}")
+            resolved_archive_file = archive_file
        else:
-            logger.info(f"loading weights file {archive_file} from cache at {resolved_archive_file}")
+            logger.info(f"loading weights file {filename} from cache at {resolved_archive_file}")
    else:
        resolved_archive_file = None
diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index e1d8f5b7957be5..1a63d32e4196a0 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -37,7 +37,6 @@
 from huggingface_hub import Repository, list_repo_files
 from keras.saving.hdf5_format import save_attributes_to_hdf5_group
-from requests import HTTPError
 from transformers.utils.hub import convert_file_size_to_int, get_checkpoint_shard_files
 from .
import DataCollatorWithPadding, DefaultDataCollator @@ -48,22 +47,16 @@ from .tf_utils import shape_list from .utils import ( DUMMY_INPUTS, - HUGGINGFACE_CO_RESOLVE_ENDPOINT, TF2_WEIGHTS_INDEX_NAME, TF2_WEIGHTS_NAME, WEIGHTS_INDEX_NAME, WEIGHTS_NAME, - EntryNotFoundError, ModelOutput, PushToHubMixin, - RepositoryNotFoundError, - RevisionNotFoundError, - cached_path, + cached_file, find_labels, has_file, - hf_bucket_url, is_offline_mode, - is_remote_url, logging, requires_backends, working_or_temp_dir, @@ -2112,6 +2105,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): Mirror source to accelerate downloads in China. If you are from China and have an accessibility problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety. Please refer to the mirror site for more information. + subfolder (`str`, *optional*, defaults to `""`): + In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can + specify the folder name here. kwargs (remaining dictionary of keyword arguments, *optional*): Can be used to update the configuration object (after it being loaded) and initiate the model (e.g., `output_attentions=True`). Behaves differently depending on whether a `config` is provided or @@ -2164,6 +2160,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): load_weight_prefix = kwargs.pop("load_weight_prefix", None) from_pipeline = kwargs.pop("_from_pipeline", None) from_auto_class = kwargs.pop("_from_auto", False) + subfolder = kwargs.pop("subfolder", "") if trust_remote_code is True: logger.warning( @@ -2202,9 +2199,10 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): # This variable will flag if we're loading a sharded checkpoint. In this case the archive file is just the # index of the files. is_sharded = False - sharded_metadata = None # Load model if pretrained_model_name_or_path is not None: + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + is_local = os.path.isdir(pretrained_model_name_or_path) if os.path.isdir(pretrained_model_name_or_path): if from_pt and os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)): # Load from a PyTorch checkpoint in priority if from_pt @@ -2232,68 +2230,43 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): f"Error no file named {TF2_WEIGHTS_NAME} or {WEIGHTS_NAME} found in directory " f"{pretrained_model_name_or_path}." 
                )
-        elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
+        elif os.path.isfile(pretrained_model_name_or_path):
            archive_file = pretrained_model_name_or_path
+            is_local = True
        elif os.path.isfile(pretrained_model_name_or_path + ".index"):
            archive_file = pretrained_model_name_or_path + ".index"
+            is_local = True
        else:
+            # set correct filename
            filename = WEIGHTS_NAME if from_pt else TF2_WEIGHTS_NAME
-            archive_file = hf_bucket_url(
-                pretrained_model_name_or_path,
-                filename=filename,
-                revision=revision,
-                mirror=mirror,
-            )
-        try:
-            # Load from URL or cache if already cached
-            resolved_archive_file = cached_path(
-                archive_file,
-                cache_dir=cache_dir,
-                force_download=force_download,
-                proxies=proxies,
-                resume_download=resume_download,
-                local_files_only=local_files_only,
-                use_auth_token=use_auth_token,
-                user_agent=user_agent,
-            )
+            try:
+                # Load from URL or cache if already cached
+                cached_file_kwargs = dict(
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    proxies=proxies,
+                    resume_download=resume_download,
+                    local_files_only=local_files_only,
+                    use_auth_token=use_auth_token,
+                    user_agent=user_agent,
+                    revision=revision,
+                    subfolder=subfolder,
+                    _raise_exceptions_for_missing_entries=False,
+                )
+                resolved_archive_file = cached_file(pretrained_model_name_or_path, filename, **cached_file_kwargs)
-        except RepositoryNotFoundError:
-            raise EnvironmentError(
-                f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier "
-                "listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a "
-                "token having permission to this repo with `use_auth_token` or log in with `huggingface-cli "
-                "login` and pass `use_auth_token=True`."
-            )
-        except RevisionNotFoundError:
-            raise EnvironmentError(
-                f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for "
-                "this model name. Check the model page at "
-                f"'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions."
-            )
-        except EntryNotFoundError:
-            if filename == TF2_WEIGHTS_NAME:
-                try:
+                # Since we set _raise_exceptions_for_missing_entries=False, we don't get an exception but a None
+                # result when internet is up, the repo and revision exist, but the file does not.
+                if resolved_archive_file is None and filename == TF2_WEIGHTS_NAME:
                    # Maybe the checkpoint is sharded, we try to grab the index name in this case.
-                    archive_file = hf_bucket_url(
-                        pretrained_model_name_or_path,
-                        filename=TF2_WEIGHTS_INDEX_NAME,
-                        revision=revision,
-                        mirror=mirror,
-                    )
-                    resolved_archive_file = cached_path(
-                        archive_file,
-                        cache_dir=cache_dir,
-                        force_download=force_download,
-                        proxies=proxies,
-                        resume_download=resume_download,
-                        local_files_only=local_files_only,
-                        use_auth_token=use_auth_token,
-                        user_agent=user_agent,
+                    resolved_archive_file = cached_file(
+                        pretrained_model_name_or_path, TF2_WEIGHTS_INDEX_NAME, **cached_file_kwargs
                    )
-                    is_sharded = True
-                except EntryNotFoundError:
-                    # Otherwise, maybe there is a TF or Flax model file. We try those to give a helpful error
+                    if resolved_archive_file is not None:
+                        is_sharded = True
+                if resolved_archive_file is None:
+                    # Otherwise, maybe there is a PyTorch or Flax model file. We try those to give a helpful error
                    # message.
                    has_file_kwargs = {
                        "revision": revision,
@@ -2312,42 +2285,32 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
                            f"{pretrained_model_name_or_path} does not appear to have a file named"
                            f" {TF2_WEIGHTS_NAME} or {WEIGHTS_NAME}."
                        )
-            else:
+
+            except EnvironmentError:
+                # Raise any environment error raised by `cached_file`. It will have a helpful error message adapted
+                # to the original exception.
+                raise
+            except Exception:
+                # For any other exception, we throw a generic error.
+
                raise EnvironmentError(
-                    f"{pretrained_model_name_or_path} does not appear to have a file named {filename}."
+                    f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it"
+                    " from 'https://huggingface.co/models', make sure you don't have a local directory with the"
+                    f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
+                    f" directory containing a file named {TF2_WEIGHTS_NAME} or {WEIGHTS_NAME}."
                )
-        except HTTPError as err:
-            raise EnvironmentError(
-                f"There was a specific connection error when trying to load {pretrained_model_name_or_path}:\n"
-                f"{err}"
-            )
-        except ValueError:
-            raise EnvironmentError(
-                f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this model, couldn't find it"
-                f" in the cached files and it looks like {pretrained_model_name_or_path} is not the path to a"
-                f" directory containing a file named {TF2_WEIGHTS_NAME} or {WEIGHTS_NAME}.\nCheckout your internet"
-                " connection or see how to run the library in offline mode at"
-                " 'https://huggingface.co/docs/transformers/installation#offline-mode'."
-            )
-        except EnvironmentError:
-            raise EnvironmentError(
-                f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it from "
-                "'https://huggingface.co/models', make sure you don't have a local directory with the same name. "
-                f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory "
-                f"containing a file named {TF2_WEIGHTS_NAME} or {WEIGHTS_NAME}."
-            )
-
-        if resolved_archive_file == archive_file:
+        if is_local:
            logger.info(f"loading weights file {archive_file}")
+            resolved_archive_file = archive_file
        else:
-            logger.info(f"loading weights file {archive_file} from cache at {resolved_archive_file}")
    else:
        resolved_archive_file = None
+            logger.info(f"loading weights file {filename} from cache at {resolved_archive_file}")
    # We'll need to download and cache each checkpoint shard if the checkpoint is sharded.
    if is_sharded:
        # resolved_archive_file becomes a list of files that point to the different checkpoint shards in this case.
- resolved_archive_file, sharded_metadata = get_checkpoint_shard_files( + resolved_archive_file, _ = get_checkpoint_shard_files( pretrained_model_name_or_path, resolved_archive_file, cache_dir=cache_dir, diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 5cd458d1f9d587..8709ec66365c66 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -31,7 +31,6 @@ from torch import Tensor, device, nn from torch.nn import CrossEntropyLoss -from requests import HTTPError from transformers.utils.hub import convert_file_size_to_int, get_checkpoint_shard_files from transformers.utils.import_utils import is_sagemaker_mp_enabled @@ -51,24 +50,18 @@ from .utils import ( DUMMY_INPUTS, FLAX_WEIGHTS_NAME, - HUGGINGFACE_CO_RESOLVE_ENDPOINT, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, WEIGHTS_INDEX_NAME, WEIGHTS_NAME, ContextManagers, - EntryNotFoundError, ModelOutput, PushToHubMixin, - RepositoryNotFoundError, - RevisionNotFoundError, - cached_path, + cached_file, copy_func, has_file, - hf_bucket_url, is_accelerate_available, is_offline_mode, - is_remote_url, logging, replace_return_docstrings, ) @@ -1868,7 +1861,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P if pretrained_model_name_or_path is not None: pretrained_model_name_or_path = str(pretrained_model_name_or_path) - if os.path.isdir(pretrained_model_name_or_path): + is_local = os.path.isdir(pretrained_model_name_or_path) + if is_local: if from_tf and os.path.isfile( os.path.join(pretrained_model_name_or_path, subfolder, TF_WEIGHTS_NAME + ".index") ): @@ -1911,10 +1905,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P f"Error no file named {WEIGHTS_NAME}, {TF2_WEIGHTS_NAME}, {TF_WEIGHTS_NAME + '.index'} or " f"{FLAX_WEIGHTS_NAME} found in directory {pretrained_model_name_or_path}." ) - elif os.path.isfile(os.path.join(subfolder, pretrained_model_name_or_path)) or is_remote_url( - pretrained_model_name_or_path - ): + elif os.path.isfile(os.path.join(subfolder, pretrained_model_name_or_path)): archive_file = pretrained_model_name_or_path + is_local = True elif os.path.isfile(os.path.join(subfolder, pretrained_model_name_or_path + ".index")): if not from_tf: raise ValueError( @@ -1922,6 +1915,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P "from_tf to True to load from this checkpoint." 
                    )
                archive_file = os.path.join(subfolder, pretrained_model_name_or_path + ".index")
+                is_local = True
            else:
                # set correct filename
                if from_tf:
@@ -1931,63 +1925,32 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                else:
                    filename = WEIGHTS_NAME
-                archive_file = hf_bucket_url(
-                    pretrained_model_name_or_path,
-                    filename=filename,
-                    revision=revision,
-                    mirror=mirror,
-                    subfolder=subfolder if len(subfolder) > 0 else None,
-                )
-
-            try:
-                # Load from URL or cache if already cached
-                resolved_archive_file = cached_path(
-                    archive_file,
-                    cache_dir=cache_dir,
-                    force_download=force_download,
-                    proxies=proxies,
-                    resume_download=resume_download,
-                    local_files_only=local_files_only,
-                    use_auth_token=use_auth_token,
-                    user_agent=user_agent,
-                )
+                try:
+                    # Load from URL or cache if already cached
+                    cached_file_kwargs = dict(
+                        cache_dir=cache_dir,
+                        force_download=force_download,
+                        proxies=proxies,
+                        resume_download=resume_download,
+                        local_files_only=local_files_only,
+                        use_auth_token=use_auth_token,
+                        user_agent=user_agent,
+                        revision=revision,
+                        subfolder=subfolder,
+                        _raise_exceptions_for_missing_entries=False,
+                    )
+                    resolved_archive_file = cached_file(pretrained_model_name_or_path, filename, **cached_file_kwargs)
-            except RepositoryNotFoundError:
-                raise EnvironmentError(
-                    f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier "
-                    "listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a "
-                    "token having permission to this repo with `use_auth_token` or log in with `huggingface-cli "
-                    "login` and pass `use_auth_token=True`."
-                )
-            except RevisionNotFoundError:
-                raise EnvironmentError(
-                    f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for "
-                    "this model name. Check the model page at "
-                    f"'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions."
-                )
-            except EntryNotFoundError:
-                if filename == WEIGHTS_NAME:
-                    try:
+                    # Since we set _raise_exceptions_for_missing_entries=False, we don't get an exception but a None
+                    # result when internet is up, the repo and revision exist, but the file does not.
+                    if resolved_archive_file is None and filename == WEIGHTS_NAME:
                        # Maybe the checkpoint is sharded, we try to grab the index name in this case.
-                        archive_file = hf_bucket_url(
-                            pretrained_model_name_or_path,
-                            filename=WEIGHTS_INDEX_NAME,
-                            revision=revision,
-                            mirror=mirror,
-                            subfolder=subfolder if len(subfolder) > 0 else None,
-                        )
-                        resolved_archive_file = cached_path(
-                            archive_file,
-                            cache_dir=cache_dir,
-                            force_download=force_download,
-                            proxies=proxies,
-                            resume_download=resume_download,
-                            local_files_only=local_files_only,
-                            use_auth_token=use_auth_token,
-                            user_agent=user_agent,
+                        resolved_archive_file = cached_file(
+                            pretrained_model_name_or_path, WEIGHTS_INDEX_NAME, **cached_file_kwargs
                        )
-                        is_sharded = True
-                    except EntryNotFoundError:
+                        if resolved_archive_file is not None:
+                            is_sharded = True
+                    if resolved_archive_file is None:
                        # Otherwise, maybe there is a TF or Flax model file. We try those to give a helpful error
                        # message.
                        has_file_kwargs = {
@@ -2013,42 +1976,31 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                            f"{pretrained_model_name_or_path} does not appear to have a file named {WEIGHTS_NAME},"
                            f" {TF2_WEIGHTS_NAME}, {TF_WEIGHTS_NAME} or {FLAX_WEIGHTS_NAME}."
                        )
-                else:
+                except EnvironmentError:
+                    # Raise any environment error raised by `cached_file`.
It will have a helpful error message adapted
+                    # to the original exception.
+                    raise
+                except Exception:
+                    # For any other exception, we throw a generic error.
                    raise EnvironmentError(
-                        f"{pretrained_model_name_or_path} does not appear to have a file named {filename}."
+                        f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it"
+                        " from 'https://huggingface.co/models', make sure you don't have a local directory with the"
+                        f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
+                        f" directory containing a file named {WEIGHTS_NAME}, {TF2_WEIGHTS_NAME}, {TF_WEIGHTS_NAME} or"
+                        f" {FLAX_WEIGHTS_NAME}."
                    )
-            except HTTPError as err:
-                raise EnvironmentError(
-                    f"There was a specific connection error when trying to load {pretrained_model_name_or_path}:\n"
-                    f"{err}"
-                )
-            except ValueError:
-                raise EnvironmentError(
-                    f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this model, couldn't find it"
-                    f" in the cached files and it looks like {pretrained_model_name_or_path} is not the path to a"
-                    f" directory containing a file named {WEIGHTS_NAME}, {TF2_WEIGHTS_NAME}, {TF_WEIGHTS_NAME} or"
-                    f" {FLAX_WEIGHTS_NAME}.\nCheckout your internet connection or see how to run the library in"
-                    " offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'."
-                )
-            except EnvironmentError:
-                raise EnvironmentError(
-                    f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it from "
-                    "'https://huggingface.co/models', make sure you don't have a local directory with the same name. "
-                    f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory "
-                    f"containing a file named {WEIGHTS_NAME}, {TF2_WEIGHTS_NAME}, {TF_WEIGHTS_NAME} or "
-                    f"{FLAX_WEIGHTS_NAME}."
-                )
-            if resolved_archive_file == archive_file:
+            if is_local:
                logger.info(f"loading weights file {archive_file}")
+                resolved_archive_file = archive_file
            else:
-                logger.info(f"loading weights file {archive_file} from cache at {resolved_archive_file}")
+                logger.info(f"loading weights file {filename} from cache at {resolved_archive_file}")
        else:
            resolved_archive_file = None
        # We'll need to download and cache each checkpoint shard if the checkpoint is sharded.
        if is_sharded:
-            # resolved_archive_file becomes a list of files that point to the different checkpoint shards in this case.
+            # resolved_archive_file becomes a list of files that point to the different checkpoint shards in this case.
            resolved_archive_file, sharded_metadata = get_checkpoint_shard_files(
                pretrained_model_name_or_path,
                resolved_archive_file,
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 8d24baf05bdb86..fc1c0ff8da3b32 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -35,21 +35,16 @@
 from .
import __version__ from .dynamic_module_utils import custom_object_save from .utils import ( - EntryNotFoundError, ExplicitEnum, PaddingStrategy, PushToHubMixin, - RepositoryNotFoundError, - RevisionNotFoundError, TensorType, add_end_docstrings, - cached_path, + cached_file, copy_func, get_file_from_repo, - hf_bucket_url, is_flax_available, is_offline_mode, - is_remote_url, is_tf_available, is_tokenizers_available, is_torch_available, @@ -1669,7 +1664,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], vocab_files = {} init_configuration = {} - if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): + is_local = os.path.isdir(pretrained_model_name_or_path) + if os.path.isfile(pretrained_model_name_or_path): if len(cls.vocab_files_names) > 1: raise ValueError( f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is not " @@ -1689,9 +1685,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE, "tokenizer_config_file": TOKENIZER_CONFIG_FILE, } - vocab_files_target = {**cls.vocab_files_names, **additional_files_names} + vocab_files = {**cls.vocab_files_names, **additional_files_names} - if "tokenizer_file" in vocab_files_target: + if "tokenizer_file" in vocab_files: # Try to get the tokenizer config to see if there are versioned tokenizer files. fast_tokenizer_file = FULL_TOKENIZER_FILE resolved_config_file = get_file_from_repo( @@ -1704,80 +1700,38 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], use_auth_token=use_auth_token, revision=revision, local_files_only=local_files_only, + subfolder=subfolder, ) if resolved_config_file is not None: with open(resolved_config_file, encoding="utf-8") as reader: tokenizer_config = json.load(reader) if "fast_tokenizer_files" in tokenizer_config: fast_tokenizer_file = get_fast_tokenizer_file(tokenizer_config["fast_tokenizer_files"]) - vocab_files_target["tokenizer_file"] = fast_tokenizer_file - - # Look for the tokenizer files - for file_id, file_name in vocab_files_target.items(): - if os.path.isdir(pretrained_model_name_or_path): - if subfolder is not None: - full_file_name = os.path.join(pretrained_model_name_or_path, subfolder, file_name) - else: - full_file_name = os.path.join(pretrained_model_name_or_path, file_name) - if not os.path.exists(full_file_name): - logger.info(f"Didn't find file {full_file_name}. 
We won't load it.") - full_file_name = None - else: - full_file_name = hf_bucket_url( - pretrained_model_name_or_path, - filename=file_name, - subfolder=subfolder, - revision=revision, - mirror=None, - ) - - vocab_files[file_id] = full_file_name + vocab_files["tokenizer_file"] = fast_tokenizer_file # Get files from url, cache, or disk depending on the case resolved_vocab_files = {} unresolved_files = [] for file_id, file_path in vocab_files.items(): + print(file_id, file_path) if file_path is None: resolved_vocab_files[file_id] = None else: - try: - resolved_vocab_files[file_id] = cached_path( - file_path, - cache_dir=cache_dir, - force_download=force_download, - proxies=proxies, - resume_download=resume_download, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - user_agent=user_agent, - ) - - except FileNotFoundError as error: - if local_files_only: - unresolved_files.append(file_id) - else: - raise error - - except RepositoryNotFoundError: - raise EnvironmentError( - f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier " - "listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to " - "pass a token having permission to this repo with `use_auth_token` or log in with " - "`huggingface-cli login` and pass `use_auth_token=True`." - ) - except RevisionNotFoundError: - raise EnvironmentError( - f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists " - "for this model name. Check the model page at " - f"'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions." - ) - except EntryNotFoundError: - logger.debug(f"{pretrained_model_name_or_path} does not contain a file named {file_path}.") - resolved_vocab_files[file_id] = None - - except ValueError: - logger.debug(f"Connection problem to access {file_path} and it wasn't found in the cache.") - resolved_vocab_files[file_id] = None + resolved_vocab_files[file_id] = cached_file( + pretrained_model_name_or_path, + file_path, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + user_agent=user_agent, + revision=revision, + subfolder=subfolder, + _raise_exceptions_for_missing_entries=False, + _raise_exceptions_for_connection_errors=False, + ) if len(unresolved_files) > 0: logger.info( @@ -1797,7 +1751,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], if file_id not in resolved_vocab_files: continue - if file_path == resolved_vocab_files[file_id]: + if is_local: logger.info(f"loading file {file_path}") else: logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}") diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index af8fc6a068012e..b810be45636e08 100644 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -60,6 +60,7 @@ PushToHubMixin, RepositoryNotFoundError, RevisionNotFoundError, + cached_file, cached_path, default_cache_path, define_sagemaker_information, @@ -76,6 +77,7 @@ is_local_clone, is_offline_mode, is_remote_url, + move_cache, send_example_telemetry, url_to_filename, ) diff --git a/src/transformers/utils/hub.py b/src/transformers/utils/hub.py index 1fd22d7a7cb70f..9e81654cda7e1a 100644 --- a/src/transformers/utils/hub.py +++ b/src/transformers/utils/hub.py @@ -19,11 +19,13 @@ import io import json import os +import re import shutil 
import subprocess
 import sys
 import tarfile
 import tempfile
+import traceback
 import warnings
 from contextlib import contextmanager
 from functools import partial
@@ -34,9 +36,20 @@
 from uuid import uuid4
 from zipfile import ZipFile, is_zipfile
+import huggingface_hub
 import requests
 from filelock import FileLock
-from huggingface_hub import CommitOperationAdd, HfFolder, create_commit, create_repo, list_repo_files, whoami
+from huggingface_hub import (
+    CommitOperationAdd,
+    HfFolder,
+    create_commit,
+    create_repo,
+    hf_hub_download,
+    list_repo_files,
+    whoami,
+)
+from huggingface_hub.constants import HUGGINGFACE_HEADER_X_LINKED_ETAG, HUGGINGFACE_HEADER_X_REPO_COMMIT
+from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError, RevisionNotFoundError
 from requests.exceptions import HTTPError
 from requests.models import Response
 from transformers.utils.logging import tqdm
@@ -385,21 +398,6 @@ def http_user_agent(user_agent: Union[Dict, str, None] = None) -> str:
     return ua
-class RepositoryNotFoundError(HTTPError):
-    """
-    Raised when trying to access a hf.co URL with an invalid repository name, or with a private repo name the user does
-    not have access to.
-    """
-
-
-class EntryNotFoundError(HTTPError):
-    """Raised when trying to access a hf.co URL with a valid repository and revision but an invalid filename."""
-
-
-class RevisionNotFoundError(HTTPError):
-    """Raised when trying to access a hf.co URL with a valid repository but an invalid revision."""
-
-
 def _raise_for_status(response: Response):
     """
     Internal version of `request.raise_for_status()` that will refine a potential HTTPError.
@@ -628,8 +626,58 @@ def _resumable_file_manager() -> "io.BufferedWriter":
         return cache_path
-def get_file_from_repo(
-    path_or_repo: Union[str, os.PathLike],
+def try_to_load_from_cache(cache_dir, repo_id, filename, revision=None):
+    """
+    Explores the cache to return the latest cached file for a given revision.
+    """
+    if revision is None:
+        revision = "main"
+
+    model_id = repo_id.replace("/", "--")
+    model_cache = os.path.join(cache_dir, f"models--{model_id}")
+    if not os.path.isdir(model_cache):
+        # No cache for this model
+        return None
+
+    # Resolve refs (for instance to convert main to the associated commit sha)
+    cached_refs = os.listdir(os.path.join(model_cache, "refs"))
+    if revision in cached_refs:
+        with open(os.path.join(model_cache, "refs", revision)) as f:
+            revision = f.read()
+
+    cached_shas = os.listdir(os.path.join(model_cache, "snapshots"))
+    if revision not in cached_shas:
+        # No cache for this revision and we won't try to return a random revision
+        return None
+
+    cached_file = os.path.join(model_cache, "snapshots", revision, filename)
+    return cached_file if os.path.isfile(cached_file) else None
+
+
+# If huggingface_hub changes the class of error for this to FileNotFoundError, we will be able to avoid that in the
+# future.
+LOCAL_FILES_ONLY_HF_ERROR = (
+    "Cannot find the requested files in the disk cache and outgoing traffic has been disabled. To enable hf.co "
+    "look-ups and downloads online, set 'local_files_only' to False."
+)
+
+
+# In the future, this ugly contextmanager can be removed when huggingface_hub has a released version where we can
+# activate/deactivate progress bars.
+@contextmanager
+def _patch_hf_hub_tqdm():
+    """
+    A context manager to make huggingface hub use the tqdm version of Transformers (which is controlled by some utils)
+    in logging.
+ """ + old_tqdm = huggingface_hub.file_download.tqdm + huggingface_hub.file_download.tqdm = tqdm + yield + huggingface_hub.file_download.tqdm = old_tqdm + + +def cached_file( + path_or_repo_id: Union[str, os.PathLike], filename: str, cache_dir: Optional[Union[str, os.PathLike]] = None, force_download: bool = False, @@ -638,12 +686,16 @@ def get_file_from_repo( use_auth_token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, local_files_only: bool = False, + subfolder: str = "", + user_agent: Optional[Union[str, Dict[str, str]]] = None, + _raise_exceptions_for_missing_entries=True, + _raise_exceptions_for_connection_errors=True, ): """ Tries to locate a file in a local folder and repo, downloads and cache it if necessary. Args: - path_or_repo (`str` or `os.PathLike`): + path_or_repo_id (`str` or `os.PathLike`): This can be either: - a string, the *model id* of a model repo on huggingface.co. @@ -670,6 +722,9 @@ def get_file_from_repo( identifier allowed by git. local_files_only (`bool`, *optional*, defaults to `False`): If `True`, will only try to load the tokenizer configuration from local files. + subfolder (`str`, *optional*, defaults to `""`): + In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can + specify the folder name here. @@ -678,43 +733,56 @@ def get_file_from_repo( Returns: - `Optional[str]`: Returns the resolved file (to the cache folder if downloaded from a repo) or `None` if the - file does not exist. + `Optional[str]`: Returns the resolved file (to the cache folder if downloaded from a repo). Examples: ```python - # Download a tokenizer configuration from huggingface.co and cache. - tokenizer_config = get_file_from_repo("bert-base-uncased", "tokenizer_config.json") - # This model does not have a tokenizer config so the result will be None. - tokenizer_config = get_file_from_repo("xlm-roberta-base", "tokenizer_config.json") + # Download a model weight from the Hub and cache it. 
+    model_weights_file = cached_file("bert-base-uncased", "pytorch_model.bin")
+    ```"""
     if is_offline_mode() and not local_files_only:
         logger.info("Offline mode: forcing local_files_only=True")
         local_files_only = True
 
+    if subfolder is None:
+        subfolder = ""
+
+    path_or_repo_id = str(path_or_repo_id)
+    full_filename = os.path.join(subfolder, filename)
+    if os.path.isdir(path_or_repo_id):
+        resolved_file = os.path.join(os.path.join(path_or_repo_id, subfolder), filename)
+        if not os.path.isfile(resolved_file):
+            if _raise_exceptions_for_missing_entries:
+                raise EnvironmentError(f"Could not locate {full_filename} inside {path_or_repo_id}.")
+            else:
+                return None
+        return resolved_file
 
-    path_or_repo = str(path_or_repo)
-    if os.path.isdir(path_or_repo):
-        resolved_file = os.path.join(path_or_repo, filename)
-        return resolved_file if os.path.isfile(resolved_file) else None
-    else:
-        resolved_file = hf_bucket_url(path_or_repo, filename=filename, revision=revision, mirror=None)
-
+    if cache_dir is None:
+        cache_dir = TRANSFORMERS_CACHE
+    if isinstance(cache_dir, Path):
+        cache_dir = str(cache_dir)
 
+    user_agent = http_user_agent(user_agent)
     try:
         # Load from URL or cache if already cached
-        resolved_file = cached_path(
-            resolved_file,
-            cache_dir=cache_dir,
-            force_download=force_download,
-            proxies=proxies,
-            resume_download=resume_download,
-            local_files_only=local_files_only,
-            use_auth_token=use_auth_token,
-        )
+        with _patch_hf_hub_tqdm():
+            resolved_file = hf_hub_download(
+                path_or_repo_id,
+                filename,
+                subfolder=None if len(subfolder) == 0 else subfolder,
+                revision=revision,
+                cache_dir=cache_dir,
+                user_agent=user_agent,
+                force_download=force_download,
+                proxies=proxies,
+                resume_download=resume_download,
+                use_auth_token=use_auth_token,
+                local_files_only=local_files_only,
+            )
     except RepositoryNotFoundError:
         raise EnvironmentError(
-            f"{path_or_repo} is not a local folder and is not a valid model identifier "
+            f"{path_or_repo_id} is not a local folder and is not a valid model identifier "
             "listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to "
             "pass a token having permission to this repo with `use_auth_token` or log in with "
            "`huggingface-cli login` and pass `use_auth_token=True`."
@@ -723,15 +791,129 @@
         raise EnvironmentError(
             f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists "
             "for this model name. Check the model page at "
-            f"'https://huggingface.co/{path_or_repo}' for available revisions."
+            f"'https://huggingface.co/{path_or_repo_id}' for available revisions."
+        )
+    except EntryNotFoundError:
+        if not _raise_exceptions_for_missing_entries:
+            return None
+        if revision is None:
+            revision = "main"
+        raise EnvironmentError(
+            f"{path_or_repo_id} does not appear to have a file named {full_filename}. Check out "
+            f"'https://huggingface.co/{path_or_repo_id}/{revision}' for available files."
+        )
+    except HTTPError as err:
+        # First we try to see if we have a cached version (not up to date):
+        resolved_file = try_to_load_from_cache(cache_dir, path_or_repo_id, full_filename, revision=revision)
+        if resolved_file is not None:
+            return resolved_file
+        if not _raise_exceptions_for_connection_errors:
+            return None
+
+        raise EnvironmentError(f"There was a specific connection error when trying to load {path_or_repo_id}:\n{err}")
+    except ValueError as err:
+        # HuggingFace Hub returns a ValueError for a missing file when local_files_only=True, so we need to catch it
+        # here. This could be caught above along with `EntryNotFoundError` if hf_hub sent a different error message.
+        if LOCAL_FILES_ONLY_HF_ERROR in err.args[0] and local_files_only and not _raise_exceptions_for_missing_entries:
+            return None
+
+        # Otherwise we try to see if we have a cached version (not up to date):
+        resolved_file = try_to_load_from_cache(cache_dir, path_or_repo_id, full_filename, revision=revision)
+        if resolved_file is not None:
+            return resolved_file
+        if not _raise_exceptions_for_connection_errors:
+            return None
+        raise EnvironmentError(
+            f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this file, couldn't find it in the"
+            f" cached files and it looks like {path_or_repo_id} is not the path to a directory containing a file named"
+            f" {full_filename}.\nCheck your internet connection or see how to run the library in offline mode at"
+            " 'https://huggingface.co/docs/transformers/installation#offline-mode'."
         )
-    except EnvironmentError:
-        # The repo and revision exist, but the file does not or there was a connection error fetching it.
-        return None
 
     return resolved_file
 
 
+def get_file_from_repo(
+    path_or_repo: Union[str, os.PathLike],
+    filename: str,
+    cache_dir: Optional[Union[str, os.PathLike]] = None,
+    force_download: bool = False,
+    resume_download: bool = False,
+    proxies: Optional[Dict[str, str]] = None,
+    use_auth_token: Optional[Union[bool, str]] = None,
+    revision: Optional[str] = None,
+    local_files_only: bool = False,
+    subfolder: str = "",
+):
+    """
+    Tries to locate a file in a local folder and repo, downloads and caches it if necessary.
+
+    Args:
+        path_or_repo (`str` or `os.PathLike`):
+            This can be either:
+
+            - a string, the *model id* of a model repo on huggingface.co.
+            - a path to a *directory* potentially containing the file.
+        filename (`str`):
+            The name of the file to locate in `path_or_repo`.
+        cache_dir (`str` or `os.PathLike`, *optional*):
+            Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
+            cache should not be used.
+        force_download (`bool`, *optional*, defaults to `False`):
+            Whether or not to force a (re-)download of the configuration files, overriding the cached versions if they
+            exist.
+        resume_download (`bool`, *optional*, defaults to `False`):
+            Whether or not to delete incompletely received files. Attempts to resume the download if such a file
+            exists.
+        proxies (`Dict[str, str]`, *optional*):
+            A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
+            'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
+        use_auth_token (`str` or *bool*, *optional*):
+            The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
+            when running `transformers-cli login` (stored in `~/.huggingface`).
+        revision (`str`, *optional*, defaults to `"main"`):
+            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
+            git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
+            identifier allowed by git.
+        local_files_only (`bool`, *optional*, defaults to `False`):
+            If `True`, will only try to load the file from local files.
+        subfolder (`str`, *optional*, defaults to `""`):
+            In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
+            specify the folder name here.
+
+    <Tip>
+
+    Passing `use_auth_token=True` is required when you want to use a private model.
+
+    </Tip>
+
+    Returns:
+        `Optional[str]`: Returns the resolved file (to the cache folder if downloaded from a repo) or `None` if the
+        file does not exist.
+
+    Examples:
+
+    ```python
+    # Download a tokenizer configuration from huggingface.co and cache.
+    tokenizer_config = get_file_from_repo("bert-base-uncased", "tokenizer_config.json")
+    # This model does not have a tokenizer config so the result will be None.
+    tokenizer_config = get_file_from_repo("xlm-roberta-base", "tokenizer_config.json")
+    ```"""
+    return cached_file(
+        path_or_repo_id=path_or_repo,
+        filename=filename,
+        cache_dir=cache_dir,
+        force_download=force_download,
+        resume_download=resume_download,
+        proxies=proxies,
+        use_auth_token=use_auth_token,
+        revision=revision,
+        local_files_only=local_files_only,
+        subfolder=subfolder,
+        _raise_exceptions_for_missing_entries=False,
+        _raise_exceptions_for_connection_errors=False,
+    )
+
+
 def has_file(
     path_or_repo: Union[str, os.PathLike],
     filename: str,
@@ -766,7 +948,7 @@ def has_file(
     r = requests.head(url, headers=headers, allow_redirects=False, proxies=proxies, timeout=10)
 
     try:
-        _raise_for_status(r)
+        huggingface_hub.utils._errors._raise_for_status(r)
         return True
     except RepositoryNotFoundError as e:
         logger.error(e)
@@ -1196,3 +1378,183 @@ def get_checkpoint_shard_files(
         cached_filenames.append(cached_filename)
 
     return cached_filenames, sharded_metadata
+
+
+# Everything below is for conversion between the old cache format and the new cache format.
+
+
+def get_all_cached_files(cache_dir=None):
+    """
+    Returns a list of all files cached with appropriate metadata.
+    """
+    if cache_dir is None:
+        cache_dir = TRANSFORMERS_CACHE
+    else:
+        cache_dir = str(cache_dir)
+
+    cached_files = []
+    for file in os.listdir(cache_dir):
+        meta_path = os.path.join(cache_dir, f"{file}.json")
+        if not os.path.isfile(meta_path):
+            continue
+
+        with open(meta_path, encoding="utf-8") as meta_file:
+            metadata = json.load(meta_file)
+        url = metadata["url"]
+        etag = metadata["etag"].replace('"', "")
+        cached_files.append({"file": file, "url": url, "etag": etag})
+
+    return cached_files
+
+
+def get_hub_metadata(url, token=None):
+    """
+    Returns the commit hash and associated etag for a given url.
+    """
+    if token is None:
+        token = HfFolder.get_token()
+    headers = {"user-agent": http_user_agent()}
+    headers["authorization"] = f"Bearer {token}"
+
+    r = huggingface_hub.file_download._request_with_retry(
+        method="HEAD", url=url, headers=headers, allow_redirects=False
+    )
+    huggingface_hub.file_download._raise_for_status(r)
+    commit_hash = r.headers.get(HUGGINGFACE_HEADER_X_REPO_COMMIT)
+    etag = r.headers.get(HUGGINGFACE_HEADER_X_LINKED_ETAG) or r.headers.get("ETag")
+    if etag is not None:
+        etag = huggingface_hub.file_download._normalize_etag(etag)
+    return etag, commit_hash
+
+
+def extract_info_from_url(url):
+    """
+    Extract repo_name, revision and filename from a url.
+ """ + search = re.search(r"^https://huggingface\.co/(.*)/resolve/([^/]*)/(.*)$", url) + if search is None: + return None + repo, revision, filename = search.groups() + cache_repo = "--".join(["models"] + repo.split("/")) + return {"repo": cache_repo, "revision": revision, "filename": filename} + + +def clean_files_for(file): + """ + Remove, if they exist, file, file.json and file.lock + """ + for f in [file, f"{file}.json", f"{file}.lock"]: + if os.path.isfile(f): + os.remove(f) + + +def move_to_new_cache(file, repo, filename, revision, etag, commit_hash): + """ + Move file to repo following the new huggingface hub cache organization. + """ + os.makedirs(repo, exist_ok=True) + + # refs + os.makedirs(os.path.join(repo, "refs"), exist_ok=True) + if revision != commit_hash: + ref_path = os.path.join(repo, "refs", revision) + with open(ref_path, "w") as f: + f.write(commit_hash) + + # blobs + os.makedirs(os.path.join(repo, "blobs"), exist_ok=True) + # TODO: replace copy by move when all works well. + blob_path = os.path.join(repo, "blobs", etag) + shutil.move(file, blob_path) + + # snapshots + os.makedirs(os.path.join(repo, "snapshots"), exist_ok=True) + os.makedirs(os.path.join(repo, "snapshots", commit_hash), exist_ok=True) + pointer_path = os.path.join(repo, "snapshots", commit_hash, filename) + huggingface_hub.file_download._create_relative_symlink(blob_path, pointer_path) + clean_files_for(file) + + +def move_cache(cache_dir=None, token=None): + if cache_dir is None: + cache_dir = TRANSFORMERS_CACHE + if token is None: + token = HfFolder.get_token() + cached_files = get_all_cached_files(cache_dir=cache_dir) + print(f"Moving {len(cached_files)} files to the new cache system") + + hub_metadata = {} + for file_info in tqdm(cached_files): + url = file_info.pop("url") + if url not in hub_metadata: + try: + hub_metadata[url] = get_hub_metadata(url, token=token) + except requests.HTTPError: + continue + + etag, commit_hash = hub_metadata[url] + if etag is None or commit_hash is None: + continue + + if file_info["etag"] != etag: + # Cached file is not up to date, we just throw it as a new version will be downloaded anyway. + clean_files_for(os.path.join(cache_dir, file_info["file"])) + continue + + url_info = extract_info_from_url(url) + if url_info is None: + # Not a file from huggingface.co + continue + + repo = os.path.join(cache_dir, url_info["repo"]) + move_to_new_cache( + file=os.path.join(cache_dir, file_info["file"]), + repo=repo, + filename=url_info["filename"], + revision=url_info["revision"], + etag=etag, + commit_hash=commit_hash, + ) + + +cache_version_file = os.path.join(TRANSFORMERS_CACHE, "version.txt") +if not os.path.isfile(cache_version_file): + cache_version = 0 +else: + with open(cache_version_file) as f: + cache_version = int(f.read()) + + +if cache_version < 1: + if is_offline_mode(): + logger.warn( + "You are offline and the cache for model files in Transformers v4.22.0 has been updated while your local " + "cache seems to be the one of a previous version. It is very likely that all your calls to any " + "`from_pretrained()` method will fail. Remove the offline mode and enable internet connection to have " + "your cache be updated automatically, then you can go back to offline mode." + ) + else: + logger.warn( + "The cache for model files in Transformers v4.22.0 has been udpated. Migrating your old cache. This is a " + "one-time only operation. You can interrupt this and resume the migration later on by calling " + "`transformers.utils.move_cache()`." 
+ ) + try: + move_cache() + except Exception as e: + trace = "\n".join(traceback.format_tb(e.__traceback__)) + logger.error( + f"There was a problem when trying to move your cache:\n\n{trace}\n\nPlease file an issue at " + "https://github.com/huggingface/transformers/issues/new/choose and copy paste this whole message and we " + "will do our best to help." + ) + + try: + os.makedirs(TRANSFORMERS_CACHE, exist_ok=True) + with open(cache_version_file, "w") as f: + f.write("1") + except Exception: + logger.warn( + f"There was a problem when trying to write in your cache folder ({TRANSFORMERS_CACHE}). You should set " + "the environment variable TRANSFORMERS_CACHE to a writable directory." + ) diff --git a/tests/test_configuration_common.py b/tests/test_configuration_common.py index b6c8ed77dc3571..397346c7deec77 100644 --- a/tests/test_configuration_common.py +++ b/tests/test_configuration_common.py @@ -345,14 +345,14 @@ def test_cached_files_are_used_when_internet_is_down(self): # A mock response for an HTTP head request to emulate server down response_mock = mock.Mock() response_mock.status_code = 500 - response_mock.headers = [] + response_mock.headers = {} response_mock.raise_for_status.side_effect = HTTPError # Download this model to make sure it's in the cache. _ = BertConfig.from_pretrained("hf-internal-testing/tiny-random-bert") # Under the mock environment we get a 500 error when trying to reach the model. - with mock.patch("transformers.utils.hub.requests.head", return_value=response_mock) as mock_head: + with mock.patch("requests.request", return_value=response_mock) as mock_head: _ = BertConfig.from_pretrained("hf-internal-testing/tiny-random-bert") # This check we did call the fake head request mock_head.assert_called() diff --git a/tests/test_feature_extraction_common.py b/tests/test_feature_extraction_common.py index a822b75cc5eb62..3ecf89a908672f 100644 --- a/tests/test_feature_extraction_common.py +++ b/tests/test_feature_extraction_common.py @@ -170,13 +170,13 @@ def test_cached_files_are_used_when_internet_is_down(self): # A mock response for an HTTP head request to emulate server down response_mock = mock.Mock() response_mock.status_code = 500 - response_mock.headers = [] + response_mock.headers = {} response_mock.raise_for_status.side_effect = HTTPError # Download this model to make sure it's in the cache. _ = Wav2Vec2FeatureExtractor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2") # Under the mock environment we get a 500 error when trying to reach the model. - with mock.patch("transformers.utils.hub.requests.head", return_value=response_mock) as mock_head: + with mock.patch("requests.request", return_value=response_mock) as mock_head: _ = Wav2Vec2FeatureExtractor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2") # This check we did call the fake head request mock_head.assert_called() diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index c05771336e6365..8f80d7fa42f791 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -2925,14 +2925,14 @@ def test_cached_files_are_used_when_internet_is_down(self): # A mock response for an HTTP head request to emulate server down response_mock = mock.Mock() response_mock.status_code = 500 - response_mock.headers = [] + response_mock.headers = {} response_mock.raise_for_status.side_effect = HTTPError # Download this model to make sure it's in the cache. 
_ = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert") # Under the mock environment we get a 500 error when trying to reach the model. - with mock.patch("transformers.utils.hub.requests.head", return_value=response_mock) as mock_head: + with mock.patch("requests.request", return_value=response_mock) as mock_head: _ = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert") # This check we did call the fake head request mock_head.assert_called() diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py index 15855e6a1f40e6..abf26af2b65116 100644 --- a/tests/test_modeling_tf_common.py +++ b/tests/test_modeling_tf_common.py @@ -1922,14 +1922,14 @@ def test_cached_files_are_used_when_internet_is_down(self): # A mock response for an HTTP head request to emulate server down response_mock = mock.Mock() response_mock.status_code = 500 - response_mock.headers = [] + response_mock.headers = {} response_mock.raise_for_status.side_effect = HTTPError # Download this model to make sure it's in the cache. _ = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert") # Under the mock environment we get a 500 error when trying to reach the model. - with mock.patch("transformers.utils.hub.requests.head", return_value=response_mock) as mock_head: + with mock.patch("requests.request", return_value=response_mock) as mock_head: _ = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert") # This check we did call the fake head request mock_head.assert_called() diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index e1ed8530fdbdea..5941a571189960 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -3829,14 +3829,14 @@ def test_cached_files_are_used_when_internet_is_down(self): # A mock response for an HTTP head request to emulate server down response_mock = mock.Mock() response_mock.status_code = 500 - response_mock.headers = [] + response_mock.headers = {} response_mock.raise_for_status.side_effect = HTTPError # Download this model to make sure it's in the cache. _ = BertTokenizer.from_pretrained("hf-internal-testing/tiny-random-bert") # Under the mock environment we get a 500 error when trying to reach the model. 
- with mock.patch("transformers.utils.hub.requests.head", return_value=response_mock) as mock_head: + with mock.patch("requests.request", return_value=response_mock) as mock_head: _ = BertTokenizer.from_pretrained("hf-internal-testing/tiny-random-bert") # This check we did call the fake head request mock_head.assert_called() From 7472b39fc3a403fca953348ad8b72990b4437e9f Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Fri, 5 Aug 2022 16:49:54 +0200 Subject: [PATCH 063/162] Fix `test_dbmdz_english` by updating expected values (#18482) Co-authored-by: ydshieh --- .../test_pipelines_token_classification.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/pipelines/test_pipelines_token_classification.py b/tests/pipelines/test_pipelines_token_classification.py index 1d71529cdfee6b..bc4eaef06255e3 100644 --- a/tests/pipelines/test_pipelines_token_classification.py +++ b/tests/pipelines/test_pipelines_token_classification.py @@ -284,9 +284,9 @@ def test_dbmdz_english(self): self.assertEqual( nested_simplify(output), [ - {"entity": "I-PER", "score": 0.997, "word": "En", "start": 0, "end": 2, "index": 1}, - {"entity": "I-PER", "score": 0.996, "word": "##zo", "start": 2, "end": 4, "index": 2}, - {"entity": "I-ORG", "score": 0.999, "word": "UN", "start": 22, "end": 24, "index": 7}, + {"entity": "I-PER", "score": 0.998, "word": "En", "start": 0, "end": 2, "index": 1}, + {"entity": "I-PER", "score": 0.997, "word": "##zo", "start": 2, "end": 4, "index": 2}, + {"entity": "I-ORG", "score": 0.999, "word": "UN", "start": 18, "end": 20, "index": 6}, ], ) @@ -295,8 +295,8 @@ def test_dbmdz_english(self): self.assertEqual( nested_simplify(output), [ - {"entity_group": "PER", "score": 0.996, "word": "Enzo", "start": 0, "end": 4}, - {"entity_group": "ORG", "score": 0.999, "word": "UN", "start": 22, "end": 24}, + {"entity_group": "PER", "score": 0.997, "word": "Enzo", "start": 0, "end": 4}, + {"entity_group": "ORG", "score": 0.999, "word": "UN", "start": 18, "end": 20}, ], ) @@ -305,8 +305,8 @@ def test_dbmdz_english(self): self.assertEqual( nested_simplify(output[:3]), [ - {"entity_group": "PER", "score": 0.997, "word": "Enzo", "start": 0, "end": 4}, - {"entity_group": "ORG", "score": 0.999, "word": "UN", "start": 22, "end": 24}, + {"entity_group": "PER", "score": 0.998, "word": "Enzo", "start": 0, "end": 4}, + {"entity_group": "ORG", "score": 0.999, "word": "UN", "start": 18, "end": 20}, ], ) @@ -315,8 +315,8 @@ def test_dbmdz_english(self): self.assertEqual( nested_simplify(output[:3]), [ - {"entity_group": "PER", "score": 0.997, "word": "Enzo", "start": 0, "end": 4}, - {"entity_group": "ORG", "score": 0.999, "word": "UN", "start": 22, "end": 24}, + {"entity_group": "PER", "score": 0.998, "word": "Enzo", "start": 0, "end": 4}, + {"entity_group": "ORG", "score": 0.999, "word": "UN", "start": 18, "end": 20}, ], ) @@ -325,8 +325,8 @@ def test_dbmdz_english(self): self.assertEqual( nested_simplify(output), [ - {"entity_group": "PER", "score": 0.996, "word": "Enzo", "start": 0, "end": 4}, - {"entity_group": "ORG", "score": 0.999, "word": "UN", "start": 22, "end": 24}, + {"entity_group": "PER", "score": 0.997, "word": "Enzo", "start": 0, "end": 4}, + {"entity_group": "ORG", "score": 0.999, "word": "UN", "start": 18, "end": 20}, ], ) From 35a534af956bb613c0eba5778b38377957d6f31e Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Fri, 5 Aug 2022 13:14:00 -0400 Subject: [PATCH 064/162] Move 
cache folder to huggingface/hub for consistency with hf_hub (#18492) * Move cache folder to just huggingface * Thank you VsCode for this needless import * Move to hub * Forgot one --- docs/source/en/installation.mdx | 8 ++++---- src/transformers/utils/hub.py | 16 ++++++++++++---- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/docs/source/en/installation.mdx b/docs/source/en/installation.mdx index f20490115842c3..4ff4e04436c74e 100644 --- a/docs/source/en/installation.mdx +++ b/docs/source/en/installation.mdx @@ -139,11 +139,11 @@ conda install -c huggingface transformers ## Cache setup -Pretrained models are downloaded and locally cached at: `~/.cache/huggingface/transformers/`. This is the default directory given by the shell environment variable `TRANSFORMERS_CACHE`. On Windows, the default directory is given by `C:\Users\username\.cache\huggingface\transformers`. You can change the shell environment variables shown below - in order of priority - to specify a different cache directory: +Pretrained models are downloaded and locally cached at: `~/.cache/huggingface/hub`. This is the default directory given by the shell environment variable `TRANSFORMERS_CACHE`. On Windows, the default directory is given by `C:\Users\username\.cache\huggingface\hub`. You can change the shell environment variables shown below - in order of priority - to specify a different cache directory: -1. Shell environment variable (default): `TRANSFORMERS_CACHE`. -2. Shell environment variable: `HF_HOME` + `transformers/`. -3. Shell environment variable: `XDG_CACHE_HOME` + `/huggingface/transformers`. +1. Shell environment variable (default): `HUGGINGFACE_HUB_CACHE` or `TRANSFORMERS_CACHE`. +2. Shell environment variable: `HF_HOME`. +3. Shell environment variable: `XDG_CACHE_HOME` + `/huggingface`. diff --git a/src/transformers/utils/hub.py b/src/transformers/utils/hub.py index 9e81654cda7e1a..7fa4c0a151ace4 100644 --- a/src/transformers/utils/hub.py +++ b/src/transformers/utils/hub.py @@ -81,7 +81,7 @@ def is_offline_mode(): hf_cache_home = os.path.expanduser( os.getenv("HF_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "huggingface")) ) -default_cache_path = os.path.join(hf_cache_home, "transformers") +default_cache_path = os.path.join(hf_cache_home, "hub") # Onetime move from the old location to the new one if no ENV variable has been set. 
if ( @@ -102,7 +102,8 @@ def is_offline_mode(): PYTORCH_PRETRAINED_BERT_CACHE = os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", default_cache_path) PYTORCH_TRANSFORMERS_CACHE = os.getenv("PYTORCH_TRANSFORMERS_CACHE", PYTORCH_PRETRAINED_BERT_CACHE) -TRANSFORMERS_CACHE = os.getenv("TRANSFORMERS_CACHE", PYTORCH_TRANSFORMERS_CACHE) +HUGGINGFACE_HUB_CACHE = os.getenv("HUGGINGFACE_HUB_CACHE", PYTORCH_TRANSFORMERS_CACHE) +TRANSFORMERS_CACHE = os.getenv("TRANSFORMERS_CACHE", HUGGINGFACE_HUB_CACHE) HF_MODULES_CACHE = os.getenv("HF_MODULES_CACHE", os.path.join(hf_cache_home, "modules")) TRANSFORMERS_DYNAMIC_MODULE_NAME = "transformers_modules" SESSION_ID = uuid4().hex @@ -1475,9 +1476,16 @@ def move_to_new_cache(file, repo, filename, revision, etag, commit_hash): clean_files_for(file) -def move_cache(cache_dir=None, token=None): +def move_cache(cache_dir=None, new_cache_dir=None, token=None): + if new_cache_dir is None: + new_cache_dir = TRANSFORMERS_CACHE if cache_dir is None: - cache_dir = TRANSFORMERS_CACHE + # Migrate from old cache in .cache/huggingface/hub + old_cache = Path(TRANSFORMERS_CACHE).parent / "transformers" + if os.path.isdir(str(old_cache)): + cache_dir = str(old_cache) + else: + cache_dir = new_cache_dir if token is None: token = HfFolder.get_token() cached_files = get_all_cached_files(cache_dir=cache_dir) From 2c96675d0ef426abf7c8af04baa93d67a44e4d6c Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Fri, 5 Aug 2022 19:17:51 +0200 Subject: [PATCH 065/162] Update some expected values in `quicktour.mdx` for `resampy 0.3.0` (#18484) Co-authored-by: ydshieh --- docs/source/en/quicktour.mdx | 2 +- docs/source/es/quicktour.mdx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/quicktour.mdx b/docs/source/en/quicktour.mdx index dcadf6da34884f..c5b333bf5694fe 100644 --- a/docs/source/en/quicktour.mdx +++ b/docs/source/en/quicktour.mdx @@ -136,7 +136,7 @@ Let's extract the raw waveform arrays of the first 4 samples and pass it as a li ```py >>> result = speech_recognizer(dataset[:4]["audio"]) >>> print([d["text"] for d in result]) -['I WOULD LIKE TO SET UP A JOINT ACCOUNT WITH MY PARTNER HOW DO I PROCEED WITH DOING THAT', "FONDERING HOW I'D SET UP A JOIN TO HET WITH MY WIFE AND WHERE THE AP MIGHT BE", "I I'D LIKE TOY SET UP A JOINT ACCOUNT WITH MY PARTNER I'M NOT SEEING THE OPTION TO DO IT ON THE APSO I CALLED IN TO GET SOME HELP CAN I JUST DO IT OVER THE PHONE WITH YOU AND GIVE YOU THE INFORMATION OR SHOULD I DO IT IN THE AP AND I'M MISSING SOMETHING UQUETTE HAD PREFERRED TO JUST DO IT OVER THE PHONE OF POSSIBLE THINGS", 'HOW DO I TURN A JOIN A COUNT'] +['I WOULD LIKE TO SET UP A JOINT ACCOUNT WITH MY PARTNER HOW DO I PROCEED WITH DOING THAT', "FODING HOW I'D SET UP A JOIN TO HET WITH MY WIFE AND WHERE THE AP MIGHT BE", "I I'D LIKE TOY SET UP A JOINT ACCOUNT WITH MY PARTNER I'M NOT SEEING THE OPTION TO DO IT ON THE AP SO I CALLED IN TO GET SOME HELP CAN I JUST DO IT OVER THE PHONE WITH YOU AND GIVE YOU THE INFORMATION OR SHOULD I DO IT IN THE AP AND I'M MISSING SOMETHING UQUETTE HAD PREFERRED TO JUST DO IT OVER THE PHONE OF POSSIBLE THINGS", 'HOW DO I THURN A JOIN A COUNT'] ``` For a larger dataset where the inputs are big (like in speech or vision), you will want to pass along a generator instead of a list that loads all the inputs in memory. See the [pipeline documentation](./main_classes/pipelines) for more information. 
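Stepping back to PATCH 064 just above: here is a minimal standalone sketch of the cache-path resolution it installs in `src/transformers/utils/hub.py`. It is an approximation (it skips the legacy `PYTORCH_PRETRAINED_BERT_CACHE` and `PYTORCH_TRANSFORMERS_CACHE` fallbacks that the real code keeps), not the library's exact logic:

```py
import os

# Default location after PATCH 064: ~/.cache/huggingface/hub instead of
# ~/.cache/huggingface/transformers.
hf_cache_home = os.path.expanduser(
    os.getenv("HF_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "huggingface"))
)
default_cache_path = os.path.join(hf_cache_home, "hub")

# Resolution order: TRANSFORMERS_CACHE wins over HUGGINGFACE_HUB_CACHE, which
# wins over the default path derived above.
hub_cache = os.getenv("HUGGINGFACE_HUB_CACHE", default_cache_path)
transformers_cache = os.getenv("TRANSFORMERS_CACHE", hub_cache)
print(transformers_cache)
```

With no overrides set, the sketch prints the expanded form of `~/.cache/huggingface/hub`, matching the updated `installation.mdx` text above.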
diff --git a/docs/source/es/quicktour.mdx b/docs/source/es/quicktour.mdx index 9de9e9af4b6e55..408c3fa375a074 100644 --- a/docs/source/es/quicktour.mdx +++ b/docs/source/es/quicktour.mdx @@ -129,7 +129,7 @@ Extraigamos las matrices de onda cruda (raw waveform, en inglés) de las primera ```py >>> resultado = reconocedor_de_voz(dataset[:4]["audio"]) >>> print([d["text"] for d in resultado]) -['ahora buenas e a ver tengo un problema como vuestra aplicación resulta que que quiero hacer una transferencia bancaria a una cuenta conocida pero me da error la aplicación a ver que a ver que puede ser', 'la aplicación no cargue salda de mi nueva cuenta', 'hola tengo un problema con la aplicación no carga y y tampoco veo que carga el saldo de mi cuenta nueva dice que la aplicación está siendo reparada y ahora no puedo aceder a mi cuenta no necesito inmediatamente', 'ora buena la aplicación no se carga la viladad no carga el saldo de mi cuenta nueva dice que la villadenta siendo reparada y oro no puede hacer a mi cuenta'] +['ahora buenas eh a ver tengo un problema con vuestra aplicación resulta que que quiero hacer una transferencia bancaria a una cuenta conocida pero me da error la aplicación a ver que a ver que puede ser', 'la aplicación no cargue saldo de mi nueva cuenta', 'hola tengo un problema con la aplicación no carga y y tampoco veo que carga el saldo de mi cuenta nueva dice que la aplicación está siendo reparada y ahora no puedo acceder a mi cuenta no necesito inmediatamente', 'hora buena la aplicación no se carga la vileza no carga el saldo de mi cuenta nueva dice que la villadenta siendo reparada y oro no puedo hacer a mi cuenta'] ``` Para un dataset más grande, donde los inputs son de mayor tamaño (como en habla/audio o visión), querrás pasar un generador en lugar de una lista que carga todos los inputs en memoria. Ve la [documentación del pipeline](./main_classes/pipelines) para más información. 
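The patch that follows fixes a destination-directory bug in the `move_cache` migration added earlier in this series, so a usage sketch may help orient the reader. This is a minimal sketch, assuming `move_cache` is importable from `transformers.utils` as the earlier `utils/__init__.py` diff re-exports it; the commented paths are hypothetical examples, not defaults read from the code:

```py
# A minimal sketch, not the library's documented entry point.
from transformers.utils import move_cache

# Re-run the one-time migration by hand; it also fires automatically at import
# time whenever the cache version file is missing or reads below 1.
move_cache()

# After PATCH 064 above, a separate destination can be given explicitly.
# Both paths below are hypothetical examples only:
# move_cache(
#     cache_dir="/home/me/.cache/huggingface/transformers",
#     new_cache_dir="/home/me/.cache/huggingface/hub",
# )
```

Each migrated file lands as a blob under `models--{org}--{name}/blobs`, with a relative symlink from `snapshots/{commit_hash}/{filename}`; that is the same layout the `try_to_load_from_cache` helper shown earlier walks when a download fails.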
From fc879699c4757b7db2e687f49414330bc16465d4 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger
Date: Fri, 5 Aug 2022 13:24:53 -0400
Subject: [PATCH 066/162] Forgot one new_ for cache migration

---
 src/transformers/utils/hub.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/utils/hub.py b/src/transformers/utils/hub.py
index 7fa4c0a151ace4..81fd4151653194 100644
--- a/src/transformers/utils/hub.py
+++ b/src/transformers/utils/hub.py
@@ -1514,7 +1514,7 @@ def move_cache(cache_dir=None, new_cache_dir=None, token=None):
             # Not a file from huggingface.co
             continue
 
-        repo = os.path.join(cache_dir, url_info["repo"])
+        repo = os.path.join(new_cache_dir, url_info["repo"])
         move_to_new_cache(
             file=os.path.join(cache_dir, file_info["file"]),
             repo=repo,

From e8f57723a362d27c664e179d03f9a51af8e4536f Mon Sep 17 00:00:00 2001
From: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
Date: Fri, 5 Aug 2022 19:27:19 +0200
Subject: [PATCH 067/162] disable Onnx test for google/long-t5-tglobal-base (#18454)

Co-authored-by: ydshieh
---
 tests/onnx/test_onnx_v2.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/onnx/test_onnx_v2.py b/tests/onnx/test_onnx_v2.py
index c15910734f3adb..cfc58dd335c30d 100644
--- a/tests/onnx/test_onnx_v2.py
+++ b/tests/onnx/test_onnx_v2.py
@@ -228,7 +228,9 @@ def test_values_override(self):
     ("blenderbot", "facebook/blenderbot-400M-distill"),
     ("bigbird-pegasus", "google/bigbird-pegasus-large-arxiv"),
     ("longt5", "google/long-t5-local-base"),
-    ("longt5", "google/long-t5-tglobal-base"),
+    # Disabled for now as it causes a fatal error `Floating point exception (core dumped)` and the subsequent tests
+    # are not run.
+    # ("longt5", "google/long-t5-tglobal-base"),
 }

 # TODO(lewtun): Include the same model types in `PYTORCH_EXPORT_MODELS` once TensorFlow has parity with the PyTorch model implementations.

From 707c0ff5f0ff170dc71a76d7235fa69c5141d5e3 Mon Sep 17 00:00:00 2001
From: Julien Chaumond
Date: Fri, 5 Aug 2022 19:29:38 +0200
Subject: [PATCH 068/162] Typo reported by Joel Grus on TWTR (#18493)

---
 src/transformers/utils/hub.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/utils/hub.py b/src/transformers/utils/hub.py
index 81fd4151653194..2488ab8f690865 100644
--- a/src/transformers/utils/hub.py
+++ b/src/transformers/utils/hub.py
@@ -1543,7 +1543,7 @@ def move_cache(cache_dir=None, new_cache_dir=None, token=None):
         )
     else:
         logger.warn(
-            "The cache for model files in Transformers v4.22.0 has been udpated. Migrating your old cache. This is a "
+            "The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a "
             "one-time only operation. You can interrupt this and resume the migration later on by calling "
             "`transformers.utils.move_cache()`."
) From 0ff35a8e6e6f770d4118143eb1a7920dc2e0c6ba Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Sat, 6 Aug 2022 09:38:55 +0200 Subject: [PATCH 069/162] =?UTF-8?q?Just=20re-reading=20the=20whole=20doc?= =?UTF-8?q?=20every=20couple=20of=20months=20=F0=9F=98=AC=20(#18489)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Delete valohai.yaml * NLP => ML * typo * website supports https * datasets * 60k + modalities * unrelated link fixing for accelerate * Ok those links were actually broken * Fix link * Make `AutoTokenizer` auto-link * wording tweak * add at least one non-nlp task --- README.md | 8 +- docs/source/en/accelerate.mdx | 10 +- docs/source/en/model_sharing.mdx | 2 +- docs/source/en/perf_train_gpu_one.mdx | 2 +- docs/source/en/pipeline_tutorial.mdx | 2 +- docs/source/en/run_scripts.mdx | 2 +- docs/source/en/task_summary.mdx | 2 +- docs/source/es/accelerate.mdx | 10 +- docs/source/es/model_sharing.mdx | 2 +- docs/source/es/run_scripts.mdx | 2 +- docs/source/it/accelerate.mdx | 10 +- docs/source/it/model_sharing.mdx | 2 +- docs/source/it/run_scripts.mdx | 2 +- docs/source/pt/accelerate.mdx | 10 +- examples/legacy/pytorch-lightning/run_ner.sh | 2 +- .../legacy/token-classification/README.md | 2 +- examples/legacy/token-classification/run.sh | 2 +- examples/pytorch/README.md | 4 +- examples/tensorflow/README.md | 2 +- valohai.yaml | 91 ------------------- 20 files changed, 39 insertions(+), 130 deletions(-) delete mode 100644 valohai.yaml diff --git a/README.md b/README.md index 0cda209bdfc32c..46a4b07c14cd32 100644 --- a/README.md +++ b/README.md @@ -157,7 +157,7 @@ Here we get a list of objects detected in the image, with a box surrounding the You can learn more about the tasks supported by the `pipeline` API in [this tutorial](https://huggingface.co/docs/transformers/task_summary). -To download and use any of the pretrained models on your given task, all it takes is three lines of code. Here is the PyTorch version: +In addition to `pipeline`, to download and use any of the pretrained models on your given task, all it takes is three lines of code. Here is the PyTorch version: ```python >>> from transformers import AutoTokenizer, AutoModel @@ -181,7 +181,7 @@ And here is the equivalent code for TensorFlow: The tokenizer is responsible for all the preprocessing the pretrained model expects, and can be called directly on a single string (as in the above examples) or a list. It will output a dictionary that you can use in downstream code or simply directly pass to your model using the ** argument unpacking operator. -The model itself is a regular [Pytorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) or a [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) (depending on your backend) which you can use normally. [This tutorial](https://huggingface.co/docs/transformers/training) explains how to integrate such a model into a classic PyTorch or TensorFlow training loop, or how to use our `Trainer` API to quickly fine-tune on a new dataset. +The model itself is a regular [Pytorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) or a [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) (depending on your backend) which you can use as usual. 
[This tutorial](https://huggingface.co/docs/transformers/training) explains how to integrate such a model into a classic PyTorch or TensorFlow training loop, or how to use our `Trainer` API to quickly fine-tune on a new dataset. ## Why should I use transformers? @@ -194,7 +194,7 @@ The model itself is a regular [Pytorch `nn.Module`](https://pytorch.org/docs/sta 1. Lower compute costs, smaller carbon footprint: - Researchers can share trained models instead of always retraining. - Practitioners can reduce compute time and production costs. - - Dozens of architectures with over 20,000 pretrained models, some in more than 100 languages. + - Dozens of architectures with over 60,000 pretrained models across all modalities. 1. Choose the right framework for every part of a model's lifetime: - Train state-of-the-art models in 3 lines of code. @@ -209,7 +209,7 @@ The model itself is a regular [Pytorch `nn.Module`](https://pytorch.org/docs/sta ## Why shouldn't I use transformers? - This library is not a modular toolbox of building blocks for neural nets. The code in the model files is not refactored with additional abstractions on purpose, so that researchers can quickly iterate on each of the models without diving into additional abstractions/files. -- The training API is not intended to work on any model but is optimized to work with the models provided by the library. For generic machine learning loops, you should use another library. +- The training API is not intended to work on any model but is optimized to work with the models provided by the library. For generic machine learning loops, you should use another library (possibly, [Accelerate](https://huggingface.co/docs/accelerate)). - While we strive to present as many use cases as possible, the scripts in our [examples folder](https://github.com/huggingface/transformers/tree/main/examples) are just that: examples. It is expected that they won't work out-of-the box on your specific problem and that you will be required to change a few lines of code to adapt them to your needs. ## Installation diff --git a/docs/source/en/accelerate.mdx b/docs/source/en/accelerate.mdx index 58b6e6958fa2d6..c215758d47b6a3 100644 --- a/docs/source/en/accelerate.mdx +++ b/docs/source/en/accelerate.mdx @@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License. # Distributed training with 🤗 Accelerate -As models get bigger, parallelism has emerged as a strategy for training larger models on limited hardware and accelerating training speed by several orders of magnitude. At Hugging Face, we created the [🤗 Accelerate](https://huggingface.co/docs/accelerate/index.html) library to help users easily train a 🤗 Transformers model on any type of distributed setup, whether it is multiple GPU's on one machine or multiple GPU's across several machines. In this tutorial, learn how to customize your native PyTorch training loop to enable training in a distributed environment. +As models get bigger, parallelism has emerged as a strategy for training larger models on limited hardware and accelerating training speed by several orders of magnitude. At Hugging Face, we created the [🤗 Accelerate](https://huggingface.co/docs/accelerate) library to help users easily train a 🤗 Transformers model on any type of distributed setup, whether it is multiple GPU's on one machine or multiple GPU's across several machines. In this tutorial, learn how to customize your native PyTorch training loop to enable training in a distributed environment. 
## Setup @@ -22,7 +22,7 @@ Get started by installing 🤗 Accelerate: pip install accelerate ``` -Then import and create an [`Accelerator`](https://huggingface.co/docs/accelerate/accelerator.html#accelerate.Accelerator) object. `Accelerator` will automatically detect your type of distributed setup and initialize all the necessary components for training. You don't need to explicitly place your model on a device. +Then import and create an [`Accelerator`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator) object. `Accelerator` will automatically detect your type of distributed setup and initialize all the necessary components for training. You don't need to explicitly place your model on a device. ```py >>> from accelerate import Accelerator @@ -32,7 +32,7 @@ Then import and create an [`Accelerator`](https://huggingface.co/docs/accelerate ## Prepare to accelerate -The next step is to pass all the relevant training objects to the [`prepare`](https://huggingface.co/docs/accelerate/accelerator.html#accelerate.Accelerator.prepare) method. This includes your training and evaluation DataLoaders, a model and an optimizer: +The next step is to pass all the relevant training objects to the [`prepare`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.prepare) method. This includes your training and evaluation DataLoaders, a model and an optimizer: ```py >>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( @@ -42,7 +42,7 @@ The next step is to pass all the relevant training objects to the [`prepare`](ht ## Backward -The last addition is to replace the typical `loss.backward()` in your training loop with 🤗 Accelerate's [`backward`](https://huggingface.co/docs/accelerate/accelerator.html#accelerate.Accelerator.backward) method: +The last addition is to replace the typical `loss.backward()` in your training loop with 🤗 Accelerate's [`backward`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.backward) method: ```py >>> for epoch in range(num_epochs): @@ -129,4 +129,4 @@ accelerate launch train.py >>> notebook_launcher(training_function) ``` -For more information about 🤗 Accelerate and it's rich features, refer to the [documentation](https://huggingface.co/docs/accelerate/index.html). \ No newline at end of file +For more information about 🤗 Accelerate and it's rich features, refer to the [documentation](https://huggingface.co/docs/accelerate). \ No newline at end of file diff --git a/docs/source/en/model_sharing.mdx b/docs/source/en/model_sharing.mdx index 24da63348c8a83..e6bd7fc4a6afe2 100644 --- a/docs/source/en/model_sharing.mdx +++ b/docs/source/en/model_sharing.mdx @@ -225,4 +225,4 @@ To make sure users understand your model's capabilities, limitations, potential * Manually creating and uploading a `README.md` file. * Clicking on the **Edit model card** button in your model repository. -Take a look at the DistilBert [model card](https://huggingface.co/distilbert-base-uncased) for a good example of the type of information a model card should include. For more details about other options you can control in the `README.md` file such as a model's carbon footprint or widget examples, refer to the documentation [here](https://huggingface.co/docs/hub/model-repos). +Take a look at the DistilBert [model card](https://huggingface.co/distilbert-base-uncased) for a good example of the type of information a model card should include. 
For more details about other options you can control in the `README.md` file such as a model's carbon footprint or widget examples, refer to the documentation [here](https://huggingface.co/docs/hub/models-cards). diff --git a/docs/source/en/perf_train_gpu_one.mdx b/docs/source/en/perf_train_gpu_one.mdx index 0c130b41722388..ba5bcb456d2220 100644 --- a/docs/source/en/perf_train_gpu_one.mdx +++ b/docs/source/en/perf_train_gpu_one.mdx @@ -609,7 +609,7 @@ for step, batch in enumerate(dataloader, start=1): optimizer.zero_grad() ``` -First we wrap the dataset in a [`DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader). Then we can enable gradient checkpointing by calling the model's [`~PreTrainedModel.gradient_checkpointing_enable`] method. When we initialize the [`Accelerator`](https://huggingface.co/docs/accelerate/accelerator.html#accelerate.Accelerator) we can specifiy if we want to use mixed precision training and it will take care of it for us in the [`prepare`] call. During the [`prepare`](https://huggingface.co/docs/accelerate/accelerator.html#accelerate.Accelerator.prepare) call the dataloader will also be distributed across workers should we use multiple GPUs. We use the same 8-bit optimizer from the earlier experiments. +First we wrap the dataset in a [`DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader). Then we can enable gradient checkpointing by calling the model's [`~PreTrainedModel.gradient_checkpointing_enable`] method. When we initialize the [`Accelerator`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator) we can specifiy if we want to use mixed precision training and it will take care of it for us in the [`prepare`] call. During the [`prepare`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.prepare) call the dataloader will also be distributed across workers should we use multiple GPUs. We use the same 8-bit optimizer from the earlier experiments. Finally, we can write the main training loop. Note that the `backward` call is handled by 🤗 Accelerate. We can also see how gradient accumulation works: we normalize the loss so we get the average at the end of accumulation and once we have enough steps we run the optimization. Now the question is: does this use the same amount of memory as the previous steps? Let's check: diff --git a/docs/source/en/pipeline_tutorial.mdx b/docs/source/en/pipeline_tutorial.mdx index 274f97f0d6cc92..7929113209748d 100644 --- a/docs/source/en/pipeline_tutorial.mdx +++ b/docs/source/en/pipeline_tutorial.mdx @@ -67,7 +67,7 @@ Any additional parameters for your task can also be included in the [`pipeline`] ### Choose a model and tokenizer -The [`pipeline`] accepts any model from the [Model Hub](https://huggingface.co/models). There are tags on the Model Hub that allow you to filter for a model you'd like to use for your task. Once you've picked an appropriate model, load it with the corresponding `AutoModelFor` and [`AutoTokenizer'] class. For example, load the [`AutoModelForCausalLM`] class for a causal language modeling task: +The [`pipeline`] accepts any model from the [Model Hub](https://huggingface.co/models). There are tags on the Model Hub that allow you to filter for a model you'd like to use for your task. Once you've picked an appropriate model, load it with the corresponding `AutoModelFor` and [`AutoTokenizer`] class. 
For example, load the [`AutoModelForCausalLM`] class for a causal language modeling task: ```py >>> from transformers import AutoTokenizer, AutoModelForCausalLM diff --git a/docs/source/en/run_scripts.mdx b/docs/source/en/run_scripts.mdx index 368bd910efc762..58d6b8dd3e208c 100644 --- a/docs/source/en/run_scripts.mdx +++ b/docs/source/en/run_scripts.mdx @@ -187,7 +187,7 @@ python run_summarization.py \ ## Run a script with 🤗 Accelerate -🤗 [Accelerate](https://huggingface.co/docs/accelerate/index.html) is a PyTorch-only library that offers a unified method for training a model on several types of setups (CPU-only, multiple GPUs, TPUs) while maintaining complete visibility into the PyTorch training loop. Make sure you have 🤗 Accelerate installed if you don't already have it: +🤗 [Accelerate](https://huggingface.co/docs/accelerate) is a PyTorch-only library that offers a unified method for training a model on several types of setups (CPU-only, multiple GPUs, TPUs) while maintaining complete visibility into the PyTorch training loop. Make sure you have 🤗 Accelerate installed if you don't already have it: > Note: As Accelerate is rapidly developing, the git version of accelerate must be installed to run the scripts ```bash diff --git a/docs/source/en/task_summary.mdx b/docs/source/en/task_summary.mdx index 27781ccc0503f0..18c442ac2abb02 100644 --- a/docs/source/en/task_summary.mdx +++ b/docs/source/en/task_summary.mdx @@ -16,7 +16,7 @@ specific language governing permissions and limitations under the License. This page shows the most frequent use-cases when using the library. The models available allow for many different configurations and a great versatility in use-cases. The most simple ones are presented here, showcasing usage for -tasks such as question answering, sequence classification, named entity recognition and others. +tasks such as image classification, question answering, sequence classification, named entity recognition and others. These examples leverage auto-models, which are classes that will instantiate a model according to a given checkpoint, automatically selecting the correct model architecture. Please check the [`AutoModel`] documentation diff --git a/docs/source/es/accelerate.mdx b/docs/source/es/accelerate.mdx index 43482106dc223e..6065bc110a1d71 100644 --- a/docs/source/es/accelerate.mdx +++ b/docs/source/es/accelerate.mdx @@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License. # Entrenamiento distribuido con 🤗 Accelerate -El paralelismo ha emergido como una estrategia para entrenar modelos grandes en hardware limitado e incrementar la velocidad de entrenamiento en varios órdenes de magnitud. En Hugging Face creamos la biblioteca [🤗 Accelerate](https://huggingface.co/docs/accelerate/index.html) para ayudar a los usuarios a entrenar modelos 🤗 Transformers en cualquier tipo de configuración distribuida, ya sea en una máquina con múltiples GPUs o en múltiples GPUs distribuidas entre muchas máquinas. En este tutorial aprenderás cómo personalizar tu bucle de entrenamiento de PyTorch nativo para poder entrenar en entornos distribuidos. +El paralelismo ha emergido como una estrategia para entrenar modelos grandes en hardware limitado e incrementar la velocidad de entrenamiento en varios órdenes de magnitud. 
En Hugging Face creamos la biblioteca [🤗 Accelerate](https://huggingface.co/docs/accelerate) para ayudar a los usuarios a entrenar modelos 🤗 Transformers en cualquier tipo de configuración distribuida, ya sea en una máquina con múltiples GPUs o en múltiples GPUs distribuidas entre muchas máquinas. En este tutorial aprenderás cómo personalizar tu bucle de entrenamiento de PyTorch nativo para poder entrenar en entornos distribuidos. ## Configuración @@ -22,7 +22,7 @@ Empecemos por instalar 🤗 Accelerate: pip install accelerate ``` -Luego, importamos y creamos un objeto [`Accelerator`](https://huggingface.co/docs/accelerate/accelerator.html#accelerate.Accelerator). `Accelerator` detectará automáticamente el tipo de configuración distribuida que tengas disponible e inicializará todos los componentes necesarios para el entrenamiento. No necesitas especificar el dispositivo en donde se debe colocar tu modelo. +Luego, importamos y creamos un objeto [`Accelerator`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator). `Accelerator` detectará automáticamente el tipo de configuración distribuida que tengas disponible e inicializará todos los componentes necesarios para el entrenamiento. No necesitas especificar el dispositivo en donde se debe colocar tu modelo. ```py >>> from accelerate import Accelerator @@ -32,7 +32,7 @@ Luego, importamos y creamos un objeto [`Accelerator`](https://huggingface.co/doc ## Prepárate para acelerar -Pasa todos los objetos relevantes para el entrenamiento al método [`prepare`](https://huggingface.co/docs/accelerate/accelerator.html#accelerate.Accelerator.prepare). Esto incluye los DataLoaders de entrenamiento y evaluación, un modelo y un optimizador: +Pasa todos los objetos relevantes para el entrenamiento al método [`prepare`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.prepare). Esto incluye los DataLoaders de entrenamiento y evaluación, un modelo y un optimizador: ```py >>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( @@ -42,7 +42,7 @@ Pasa todos los objetos relevantes para el entrenamiento al método [`prepare`](h ## Backward -Por último, reemplaza el típico `loss.backward()` en tu bucle de entrenamiento con el método [`backward`](https://huggingface.co/docs/accelerate/accelerator.html#accelerate.Accelerator.backward) de 🤗 Accelerate: +Por último, reemplaza el típico `loss.backward()` en tu bucle de entrenamiento con el método [`backward`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.backward) de 🤗 Accelerate: ```py >>> for epoch in range(num_epochs): @@ -129,4 +129,4 @@ accelerate launch train.py >>> notebook_launcher(training_function) ``` -Para obtener más información sobre 🤗 Accelerate y sus numerosas funciones, consulta la [documentación](https://huggingface.co/docs/accelerate/index.html). +Para obtener más información sobre 🤗 Accelerate y sus numerosas funciones, consulta la [documentación](https://huggingface.co/docs/accelerate). diff --git a/docs/source/es/model_sharing.mdx b/docs/source/es/model_sharing.mdx index 072b80ab398b85..cf3215dc86d742 100644 --- a/docs/source/es/model_sharing.mdx +++ b/docs/source/es/model_sharing.mdx @@ -216,4 +216,4 @@ Para asegurarnos que los usuarios entiendan las capacidades de tu modelo, sus li * Elaborando y subiendo manualmente el archivo`README.md`. * Dando click en el botón **Edit model card** dentro del repositorio. 
-Toma un momento para ver la [tarjeta de modelo](https://huggingface.co/distilbert-base-uncased) de DistilBert para que tengas un buen ejemplo del tipo de información que debería incluir. Consulta [la documentación](https://huggingface.co/docs/hub/model-repos) para más detalles acerca de otras opciones que puedes controlar dentro del archivo `README.md` como la huella de carbono del modelo o ejemplos de widgets. Consulta la documentación [aquí] (https://huggingface.co/docs/hub/model-repos). +Toma un momento para ver la [tarjeta de modelo](https://huggingface.co/distilbert-base-uncased) de DistilBert para que tengas un buen ejemplo del tipo de información que debería incluir. Consulta [la documentación](https://huggingface.co/docs/hub/models-cards) para más detalles acerca de otras opciones que puedes controlar dentro del archivo `README.md` como la huella de carbono del modelo o ejemplos de widgets. Consulta la documentación [aquí] (https://huggingface.co/docs/hub/models-cards). diff --git a/docs/source/es/run_scripts.mdx b/docs/source/es/run_scripts.mdx index 9c107408456f14..73dd1ba320c1f6 100644 --- a/docs/source/es/run_scripts.mdx +++ b/docs/source/es/run_scripts.mdx @@ -187,7 +187,7 @@ python run_summarization.py \ ## Ejecutar un script con 🤗 Accelerate -🤗 [Accelerate](https://huggingface.co/docs/accelerate/index.html) es una biblioteca exclusiva de PyTorch que ofrece un método unificado para entrenar un modelo en varios tipos de configuraciones (solo CPU, GPU múltiples, TPU) mientras mantiene una visibilidad completa en el ciclo de entrenamiento de PyTorch. Asegúrate de tener 🤗 Accelerate instalado si aún no lo tienes: +🤗 [Accelerate](https://huggingface.co/docs/accelerate) es una biblioteca exclusiva de PyTorch que ofrece un método unificado para entrenar un modelo en varios tipos de configuraciones (solo CPU, GPU múltiples, TPU) mientras mantiene una visibilidad completa en el ciclo de entrenamiento de PyTorch. Asegúrate de tener 🤗 Accelerate instalado si aún no lo tienes: > Nota: Como Accelerate se está desarrollando rápidamente, debes instalar la versión git de Accelerate para ejecutar los scripts ```bash diff --git a/docs/source/it/accelerate.mdx b/docs/source/it/accelerate.mdx index 75abf65c7fcd1f..20dc1a7ff90b53 100644 --- a/docs/source/it/accelerate.mdx +++ b/docs/source/it/accelerate.mdx @@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License. # Allenamento distribuito con 🤗 Accelerate -La parallelizzazione è emersa come strategia per allenare modelli sempre più grandi su hardware limitato e accelerarne la velocità di allenamento di diversi ordini di magnitudine. In Hugging Face, abbiamo creato la libreria [🤗 Accelerate](https://huggingface.co/docs/accelerate/index.html) per aiutarti ad allenare in modo semplice un modello 🤗 Transformers su qualsiasi tipo di configurazione distribuita, sia che si tratti di più GPU su una sola macchina o di più GPU su più macchine. In questo tutorial, imparerai come personalizzare il training loop nativo di PyTorch per consentire l'addestramento in un ambiente distribuito. +La parallelizzazione è emersa come strategia per allenare modelli sempre più grandi su hardware limitato e accelerarne la velocità di allenamento di diversi ordini di magnitudine. 
In Hugging Face, abbiamo creato la libreria [🤗 Accelerate](https://huggingface.co/docs/accelerate) per aiutarti ad allenare in modo semplice un modello 🤗 Transformers su qualsiasi tipo di configurazione distribuita, sia che si tratti di più GPU su una sola macchina o di più GPU su più macchine. In questo tutorial, imparerai come personalizzare il training loop nativo di PyTorch per consentire l'addestramento in un ambiente distribuito. ## Configurazione @@ -22,7 +22,7 @@ Inizia installando 🤗 Accelerate: pip install accelerate ``` -Poi importa e crea un oggetto [`Accelerator`](https://huggingface.co/docs/accelerate/accelerator.html#accelerate.Accelerator). `Accelerator` rileverà automaticamente il tuo setup distribuito e inizializzerà tutte le componenti necessarie per l'allenamento. Non dovrai allocare esplicitamente il tuo modello su un device. +Poi importa e crea un oggetto [`Accelerator`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator). `Accelerator` rileverà automaticamente il tuo setup distribuito e inizializzerà tutte le componenti necessarie per l'allenamento. Non dovrai allocare esplicitamente il tuo modello su un device. ```py >>> from accelerate import Accelerator @@ -32,7 +32,7 @@ Poi importa e crea un oggetto [`Accelerator`](https://huggingface.co/docs/accele ## Preparati ad accelerare -Il prossimo passo è quello di passare tutti gli oggetti rilevanti per l'allenamento al metodo [`prepare`](https://huggingface.co/docs/accelerate/accelerator.html#accelerate.Accelerator.prepare). Questo include i tuoi DataLoaders per l'allenamento e per la valutazione, un modello e un ottimizzatore: +Il prossimo passo è quello di passare tutti gli oggetti rilevanti per l'allenamento al metodo [`prepare`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.prepare). Questo include i tuoi DataLoaders per l'allenamento e per la valutazione, un modello e un ottimizzatore: ```py >>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( @@ -42,7 +42,7 @@ Il prossimo passo è quello di passare tutti gli oggetti rilevanti per l'allenam ## Backward -Infine, sostituisci il tipico metodo `loss.backward()` nel tuo loop di allenamento con il metodo [`backward`](https://huggingface.co/docs/accelerate/accelerator.html#accelerate.Accelerator.backward) di 🤗 Accelerate: +Infine, sostituisci il tipico metodo `loss.backward()` nel tuo loop di allenamento con il metodo [`backward`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.backward) di 🤗 Accelerate: ```py >>> for epoch in range(num_epochs): @@ -129,4 +129,4 @@ La libreria 🤗 Accelerate può anche essere utilizzata in un notebook se stai >>> notebook_launcher(training_function) ``` -Per maggiori informazioni relative a 🤗 Accelerate e le sue numerose funzionalità, fai riferimento alla [documentazione](https://huggingface.co/docs/accelerate/index.html). \ No newline at end of file +Per maggiori informazioni relative a 🤗 Accelerate e le sue numerose funzionalità, fai riferimento alla [documentazione](https://huggingface.co/docs/accelerate). \ No newline at end of file diff --git a/docs/source/it/model_sharing.mdx b/docs/source/it/model_sharing.mdx index a60fe50b2ba578..87ba2b5b342140 100644 --- a/docs/source/it/model_sharing.mdx +++ b/docs/source/it/model_sharing.mdx @@ -231,4 +231,4 @@ Per assicurarti che chiunque possa comprendere le abilità, limitazioni, i poten * Creando manualmente e caricando un file `README.md`. 
* Premendo sul pulsante **Edit model card** nel repository del tuo modello. -Dai un'occhiata alla [scheda del modello](https://huggingface.co/distilbert-base-uncased) di DistilBert per avere un buon esempio del tipo di informazioni che una scheda di un modello deve includere. Per maggiori dettagli legati ad altre opzioni che puoi controllare nel file `README.md`, come l'impatto ambientale o widget di esempio, fai riferimento alla documentazione [qui](https://huggingface.co/docs/hub/model-repos). +Dai un'occhiata alla [scheda del modello](https://huggingface.co/distilbert-base-uncased) di DistilBert per avere un buon esempio del tipo di informazioni che una scheda di un modello deve includere. Per maggiori dettagli legati ad altre opzioni che puoi controllare nel file `README.md`, come l'impatto ambientale o widget di esempio, fai riferimento alla documentazione [qui](https://huggingface.co/docs/hub/models-cards). diff --git a/docs/source/it/run_scripts.mdx b/docs/source/it/run_scripts.mdx index 4e3f639efb9dbf..3ffd58a62830aa 100644 --- a/docs/source/it/run_scripts.mdx +++ b/docs/source/it/run_scripts.mdx @@ -187,7 +187,7 @@ python run_summarization.py \ ## Esegui uno script con 🤗 Accelerate -🤗 [Accelerate](https://huggingface.co/docs/accelerate/index.html) è una libreria compatibile solo con PyTorch che offre un metodo unificato per addestrare modelli su diverse tipologie di configurazioni (CPU, multiple GPU, TPU) mantenendo una completa visibilità rispetto al ciclo di training di PyTorch. Assicurati di aver effettuato l'installazione di 🤗 Accelerate, nel caso non lo avessi fatto: +🤗 [Accelerate](https://huggingface.co/docs/accelerate) è una libreria compatibile solo con PyTorch che offre un metodo unificato per addestrare modelli su diverse tipologie di configurazioni (CPU, multiple GPU, TPU) mantenendo una completa visibilità rispetto al ciclo di training di PyTorch. Assicurati di aver effettuato l'installazione di 🤗 Accelerate, nel caso non lo avessi fatto: > Nota: dato che Accelerate è in rapido sviluppo, è necessario installare la versione proveniente da git per eseguire gli script: ```bash diff --git a/docs/source/pt/accelerate.mdx b/docs/source/pt/accelerate.mdx index 0e2257faceff84..59dbd96a83b26a 100644 --- a/docs/source/pt/accelerate.mdx +++ b/docs/source/pt/accelerate.mdx @@ -13,7 +13,7 @@ specific language governing permissions and limitations under the License. # Treinamento distribuído com o 🤗 Accelerate O paralelismo surgiu como uma estratégia para treinar modelos grandes em hardware limitado e aumentar a velocidade -de treinamento em várias órdens de magnitude. Na Hugging Face criamos a biblioteca [🤗 Accelerate](https://huggingface.co/docs/accelerate/index.html) +de treinamento em várias ordens de magnitude. Na Hugging Face criamos a biblioteca [🤗 Accelerate](https://huggingface.co/docs/accelerate) para ajudar os usuários a treinar modelos 🤗 Transformers com qualquer configuração distribuída, seja em uma máquina com múltiplos GPUs ou em múltiplos GPUs distribuidos entre muitas máquinas. Neste tutorial, você irá aprender como personalizar seu laço de treinamento de PyTorch para poder treinar em ambientes distribuídos. @@ -26,7 +26,7 @@ De início, instale o 🤗 Accelerate: pip install accelerate ``` -Logo, devemos importar e criar um objeto [`Accelerator`](https://huggingface.co/docs/accelerate/accelerator.html#accelerate.Accelerator).
+Logo, devemos importar e criar um objeto [`Accelerator`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator). O `Accelerator` detectará automáticamente a configuração distribuída disponível e inicializará todos os componentes necessários para o treinamento. Não há necessidade portanto de especificar o dispositivo onde deve colocar seu modelo. @@ -38,7 +38,7 @@ componentes necessários para o treinamento. Não há necessidade portanto de es ## Preparando a aceleração -Passe todos os objetos relevantes ao treinamento para o método [`prepare`](https://huggingface.co/docs/accelerate/accelerator.html#accelerate.Accelerator.prepare). +Passe todos os objetos relevantes ao treinamento para o método [`prepare`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.prepare). Isto inclui os DataLoaders de treino e evaluação, um modelo e um otimizador: ```py >>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( @@ -49,7 +49,7 @@ Isto inclui os DataLoaders de treino e evaluação, um modelo e um otimizador: ## Backward -Por último, substitua o `loss.backward()` padrão em seu laço de treinamento com o método [`backward`](https://huggingface.co/docs/accelerate/accelerator.html#accelerate.Accelerator.backward) do 🤗 Accelerate: +Por último, substitua o `loss.backward()` padrão em seu laço de treinamento com o método [`backward`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.backward) do 🤗 Accelerate: ```py >>> for epoch in range(num_epochs): @@ -138,4 +138,4 @@ Encapsule o código responsável pelo treinamento de uma função e passe-o ao ` >>> notebook_launcher(training_function) ``` -Para obter mais informações sobre o 🤗 Accelerate e suas numerosas funções, consulte a [documentación](https://huggingface.co/docs/accelerate/index.html). +Para obter mais informações sobre o 🤗 Accelerate e suas numerosas funções, consulte a [documentação](https://huggingface.co/docs/accelerate). diff --git a/examples/legacy/pytorch-lightning/run_ner.sh b/examples/legacy/pytorch-lightning/run_ner.sh index 2913473eb8cdef..a5b185aa960d09 100755 --- a/examples/legacy/pytorch-lightning/run_ner.sh +++ b/examples/legacy/pytorch-lightning/run_ner.sh @@ -5,7 +5,7 @@ pip install -r ../requirements.txt ## The relevant files are currently on a shared Google ## drive at https://drive.google.com/drive/folders/1kC0I2UGl2ltrluI9NqDjaQJGw5iliw_J -## Monitor for changes and eventually migrate to nlp dataset +## Monitor for changes and eventually migrate to use the `datasets` library curl -L 'https://drive.google.com/uc?export=download&id=1Jjhbal535VVz2ap4v4r_rN1UEHTdLK5P' \ | grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > train.txt.tmp curl -L 'https://drive.google.com/uc?export=download&id=1ZfRcQThdtAR5PPRjIDtrVP7BtXSCUBbm' \ diff --git a/examples/legacy/token-classification/README.md b/examples/legacy/token-classification/README.md index cd9c1587032c54..c2fa6eec7282b2 100644 --- a/examples/legacy/token-classification/README.md +++ b/examples/legacy/token-classification/README.md @@ -291,4 +291,4 @@ On the test dataset the following results could be achieved: 05/29/2020 23:34:02 - INFO - __main__ - eval_f1 = 0.47440836543753434 ``` -WNUT’17 is a very difficult task. Current state-of-the-art results on this dataset can be found [here](http://nlpprogress.com/english/named_entity_recognition.html). +WNUT’17 is a very difficult task. Current state-of-the-art results on this dataset can be found [here](https://nlpprogress.com/english/named_entity_recognition.html).
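The legacy NER scripts touched here replace the "migrate to nlp dataset" comment with a pointer to the `datasets` library. A hedged sketch of what that migration could look like, assuming WNUT'17 is loaded via its `wnut_17` dataset id on the Hub instead of the Google Drive download:

```py
# Possible replacement for the curl/grep/cut preprocessing above, using the
# `datasets` library; "wnut_17" is the Hub dataset id this sketch assumes.
from datasets import load_dataset

wnut = load_dataset("wnut_17")
example = wnut["train"][0]
print(example["tokens"])    # whitespace-split words of one sentence
print(example["ner_tags"])  # integer NER label ids, one per token
```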
diff --git a/examples/legacy/token-classification/run.sh b/examples/legacy/token-classification/run.sh index f5cbf0d50e02ee..b5f1e5f83bc7ff 100755 --- a/examples/legacy/token-classification/run.sh +++ b/examples/legacy/token-classification/run.sh @@ -1,6 +1,6 @@ ## The relevant files are currently on a shared Google ## drive at https://drive.google.com/drive/folders/1kC0I2UGl2ltrluI9NqDjaQJGw5iliw_J -## Monitor for changes and eventually migrate to nlp dataset +## Monitor for changes and eventually migrate to use the `datasets` library curl -L 'https://drive.google.com/uc?export=download&id=1Jjhbal535VVz2ap4v4r_rN1UEHTdLK5P' \ | grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > train.txt.tmp curl -L 'https://drive.google.com/uc?export=download&id=1ZfRcQThdtAR5PPRjIDtrVP7BtXSCUBbm' \ diff --git a/examples/pytorch/README.md b/examples/pytorch/README.md index 95d42bfc8b3812..442511ead93a7a 100644 --- a/examples/pytorch/README.md +++ b/examples/pytorch/README.md @@ -15,12 +15,12 @@ limitations under the License. # Examples -This folder contains actively maintained examples of use of 🤗 Transformers using the PyTorch backend, organized along NLP tasks. +This folder contains actively maintained examples of use of 🤗 Transformers using the PyTorch backend, organized by ML task. ## The Big Table of Tasks Here is the list of all our examples: -- with information on whether they are **built on top of `Trainer``** (if not, they still work, they might +- with information on whether they are **built on top of `Trainer`** (if not, they still work, they might just lack some features), - whether or not they have a version using the [🤗 Accelerate](https://github.com/huggingface/accelerate) library. - whether or not they leverage the [🤗 Datasets](https://github.com/huggingface/datasets) library. diff --git a/examples/tensorflow/README.md b/examples/tensorflow/README.md index 967a1a8b7869e4..7936e3d4650950 100644 --- a/examples/tensorflow/README.md +++ b/examples/tensorflow/README.md @@ -15,7 +15,7 @@ limitations under the License. # Examples -This folder contains actively maintained examples of use of 🤗 Transformers organized into different NLP tasks. All examples in this folder are **TensorFlow** examples, and are written using native Keras rather than classes like `TFTrainer`, which we now consider deprecated. If you've previously only used 🤗 Transformers via `TFTrainer`, we highly recommend taking a look at the new style - we think it's a big improvement! +This folder contains actively maintained examples of use of 🤗 Transformers organized into different ML tasks. All examples in this folder are **TensorFlow** examples, and are written using native Keras rather than classes like `TFTrainer`, which we now consider deprecated. If you've previously only used 🤗 Transformers via `TFTrainer`, we highly recommend taking a look at the new style - we think it's a big improvement! In addition, all scripts here now support the [🤗 Datasets](https://github.com/huggingface/datasets) library - you can grab entire datasets just by changing one command-line argument! diff --git a/valohai.yaml b/valohai.yaml deleted file mode 100644 index 14441e27d02d4e..00000000000000 --- a/valohai.yaml +++ /dev/null @@ -1,91 +0,0 @@ ---- - -- step: - name: Execute python examples/text-classification/run_glue.py - image: pytorch/pytorch:nightly-devel-cuda10.0-cudnn7 - command: - - python /valohai/repository/utils/download_glue_data.py --data_dir=/glue_data - - pip install -e . 
- - pip install -r examples/requirements.txt - - python examples/text-classification/run_glue.py --do_train --data_dir=/glue_data/{parameter-value:task_name} {parameters} - parameters: - - name: model_type - pass-as: --model_type={v} - type: string - default: bert - - name: model_name_or_path - pass-as: --model_name_or_path={v} - type: string - default: bert-base-uncased - - name: task_name - pass-as: --task_name={v} - type: string - default: MRPC - - name: max_seq_length - pass-as: --max_seq_length={v} - description: The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded. - type: integer - default: 128 - - name: per_gpu_train_batch_size - pass-as: --per_gpu_train_batch_size={v} - description: Batch size per GPU/CPU for training. - type: integer - default: 8 - - name: per_gpu_eval_batch_size - pass-as: --per_gpu_eval_batch_size={v} - description: Batch size per GPU/CPU for evaluation. - type: integer - default: 8 - - name: gradient_accumulation_steps - pass-as: --gradient_accumulation_steps={v} - description: Number of updates steps to accumulate before performing a backward/update pass. - type: integer - default: 1 - - name: learning_rate - pass-as: --learning_rate={v} - description: The initial learning rate for Adam. - type: float - default: 0.00005 - - name: adam_epsilon - pass-as: --adam_epsilon={v} - description: Epsilon for Adam optimizer. - type: float - default: 0.00000001 - - name: max_grad_norm - pass-as: --max_grad_norm={v} - description: Max gradient norm. - type: float - default: 1.0 - - name: num_train_epochs - pass-as: --num_train_epochs={v} - description: Total number of training epochs to perform. - type: integer - default: 3 - - name: max_steps - pass-as: --max_steps={v} - description: If > 0, set total number of training steps to perform. Override num_train_epochs. - type: integer - default: -1 - - name: warmup_steps - pass-as: --warmup_steps={v} - description: Linear warmup over warmup_steps. - type: integer - default: -1 - - name: logging_steps - pass-as: --logging_steps={v} - description: Log every X updates steps. - type: integer - default: 25 - - name: save_steps - pass-as: --save_steps={v} - description: Save checkpoint every X updates steps. - type: integer - default: -1 - - name: output_dir - pass-as: --output_dir={v} - type: string - default: /valohai/outputs - - name: evaluation_strategy - description: The evaluation strategy to use. - type: string - default: steps From 1d346569d876c9f45d594c2063f10b9f311d240f Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Sat, 6 Aug 2022 09:42:55 +0200 Subject: [PATCH 070/162] `transformers-cli login` => `huggingface-cli login` (#18490) * zero chance anyone's using that constant no? 
* `transformers-cli login` => `huggingface-cli login` * `transformers-cli repo create` => `huggingface-cli repo create` * `make style` --- .../flax/image-captioning/run_image_captioning_flax.py | 2 +- examples/flax/language-modeling/run_bart_dlm_flax.py | 2 +- examples/flax/language-modeling/run_clm_flax.py | 2 +- examples/flax/language-modeling/run_mlm_flax.py | 2 +- examples/flax/language-modeling/run_t5_mlm_flax.py | 2 +- examples/flax/question-answering/run_qa.py | 2 +- examples/flax/summarization/run_summarization_flax.py | 2 +- examples/flax/text-classification/run_flax_glue.py | 2 +- examples/flax/token-classification/run_flax_ner.py | 2 +- examples/flax/vision/run_image_classification.py | 2 +- .../audio-classification/run_audio_classification.py | 2 +- examples/pytorch/contrastive-image-text/run_clip.py | 2 +- .../image-classification/run_image_classification.py | 2 +- examples/pytorch/image-pretraining/run_mae.py | 2 +- examples/pytorch/image-pretraining/run_mim.py | 2 +- examples/pytorch/language-modeling/run_clm.py | 2 +- examples/pytorch/language-modeling/run_mlm.py | 2 +- examples/pytorch/language-modeling/run_plm.py | 2 +- examples/pytorch/multiple-choice/run_swag.py | 2 +- examples/pytorch/question-answering/run_qa.py | 2 +- .../pytorch/question-answering/run_qa_beam_search.py | 2 +- examples/pytorch/question-answering/run_seq2seq_qa.py | 2 +- .../semantic-segmentation/run_semantic_segmentation.py | 2 +- .../speech-recognition/run_speech_recognition_ctc.py | 2 +- .../run_speech_recognition_seq2seq.py | 2 +- examples/pytorch/summarization/run_summarization.py | 2 +- examples/pytorch/text-classification/run_glue.py | 2 +- examples/pytorch/text-classification/run_xnli.py | 2 +- examples/pytorch/token-classification/run_ner.py | 2 +- examples/pytorch/translation/run_translation.py | 2 +- .../research_projects/layoutlmv3/run_funsd_cord.py | 2 +- examples/research_projects/mlm_wwm/run_mlm_wwm.py | 2 +- .../quantization-qdqbert/run_quant_qa.py | 2 +- .../run_speech_recognition_ctc_bnb.py | 2 +- .../run_speech_recognition_ctc_streaming.py | 2 +- .../research_projects/tapex/run_tabfact_with_tapex.py | 2 +- .../research_projects/tapex/run_wikisql_with_tapex.py | 2 +- .../tapex/run_wikitablequestions_with_tapex.py | 2 +- examples/research_projects/xtreme-s/run_xtreme_s.py | 2 +- examples/tensorflow/language-modeling/run_clm.py | 2 +- examples/tensorflow/language-modeling/run_mlm.py | 2 +- examples/tensorflow/multiple-choice/run_swag.py | 2 +- examples/tensorflow/question-answering/run_qa.py | 2 +- examples/tensorflow/summarization/run_summarization.py | 2 +- examples/tensorflow/text-classification/run_glue.py | 2 +- .../text-classification/run_text_classification.py | 2 +- examples/tensorflow/token-classification/run_ner.py | 2 +- examples/tensorflow/translation/run_translation.py | 2 +- scripts/tatoeba/README.md | 2 +- scripts/tatoeba/upload_models.sh | 2 +- src/transformers/commands/user.py | 5 +---- src/transformers/configuration_utils.py | 2 +- src/transformers/dynamic_module_utils.py | 4 ++-- src/transformers/feature_extraction_utils.py | 2 +- src/transformers/modeling_tf_utils.py | 6 +++--- src/transformers/modeling_utils.py | 2 +- .../models/auto/feature_extraction_auto.py | 4 ++-- src/transformers/models/auto/processing_auto.py | 2 +- src/transformers/models/auto/tokenization_auto.py | 2 +- src/transformers/pipelines/__init__.py | 2 +- src/transformers/tokenization_utils_base.py | 2 +- src/transformers/utils/hub.py | 10 +++++----- .../run_{{cookiecutter.example_shortcut}}.py | 
2 +- .../scripts/pytorch/run_glue_model_parallelism.py | 2 +- 64 files changed, 72 insertions(+), 75 deletions(-) diff --git a/examples/flax/image-captioning/run_image_captioning_flax.py b/examples/flax/image-captioning/run_image_captioning_flax.py index 4552defb8efc45..348a719857830a 100644 --- a/examples/flax/image-captioning/run_image_captioning_flax.py +++ b/examples/flax/image-captioning/run_image_captioning_flax.py @@ -186,7 +186,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, diff --git a/examples/flax/language-modeling/run_bart_dlm_flax.py b/examples/flax/language-modeling/run_bart_dlm_flax.py index 5c8bf1bbc45dda..6396f4ced99695 100644 --- a/examples/flax/language-modeling/run_bart_dlm_flax.py +++ b/examples/flax/language-modeling/run_bart_dlm_flax.py @@ -172,7 +172,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, diff --git a/examples/flax/language-modeling/run_clm_flax.py b/examples/flax/language-modeling/run_clm_flax.py index 5fe786da7cc5ad..1a0428fdd67039 100755 --- a/examples/flax/language-modeling/run_clm_flax.py +++ b/examples/flax/language-modeling/run_clm_flax.py @@ -173,7 +173,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, diff --git a/examples/flax/language-modeling/run_mlm_flax.py b/examples/flax/language-modeling/run_mlm_flax.py index f3f3c324ecfea6..65f6a2285d9c34 100755 --- a/examples/flax/language-modeling/run_mlm_flax.py +++ b/examples/flax/language-modeling/run_mlm_flax.py @@ -172,7 +172,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, diff --git a/examples/flax/language-modeling/run_t5_mlm_flax.py b/examples/flax/language-modeling/run_t5_mlm_flax.py index a2906c410879b9..0030fc8da66a57 100755 --- a/examples/flax/language-modeling/run_t5_mlm_flax.py +++ b/examples/flax/language-modeling/run_t5_mlm_flax.py @@ -172,7 +172,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." 
) }, diff --git a/examples/flax/question-answering/run_qa.py b/examples/flax/question-answering/run_qa.py index 0873b19413bfea..1b951e35839816 100644 --- a/examples/flax/question-answering/run_qa.py +++ b/examples/flax/question-answering/run_qa.py @@ -159,7 +159,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, diff --git a/examples/flax/summarization/run_summarization_flax.py b/examples/flax/summarization/run_summarization_flax.py index 856fd6fdb7b36a..c193fe0bc3745a 100644 --- a/examples/flax/summarization/run_summarization_flax.py +++ b/examples/flax/summarization/run_summarization_flax.py @@ -186,7 +186,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, diff --git a/examples/flax/text-classification/run_flax_glue.py b/examples/flax/text-classification/run_flax_glue.py index 7f5524dbb437d6..e0dfab2f52e994 100755 --- a/examples/flax/text-classification/run_flax_glue.py +++ b/examples/flax/text-classification/run_flax_glue.py @@ -105,7 +105,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, diff --git a/examples/flax/token-classification/run_flax_ner.py b/examples/flax/token-classification/run_flax_ner.py index 0a66b5f1990bc9..ad68c0997fed81 100644 --- a/examples/flax/token-classification/run_flax_ner.py +++ b/examples/flax/token-classification/run_flax_ner.py @@ -153,7 +153,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, diff --git a/examples/flax/vision/run_image_classification.py b/examples/flax/vision/run_image_classification.py index 305dd3ac205f0c..3de3c977ab1d46 100644 --- a/examples/flax/vision/run_image_classification.py +++ b/examples/flax/vision/run_image_classification.py @@ -162,7 +162,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, diff --git a/examples/pytorch/audio-classification/run_audio_classification.py b/examples/pytorch/audio-classification/run_audio_classification.py index 6c2a6cb8803976..9ebd4fb00759f5 100644 --- a/examples/pytorch/audio-classification/run_audio_classification.py +++ b/examples/pytorch/audio-classification/run_audio_classification.py @@ -156,7 +156,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." 
) }, diff --git a/examples/pytorch/contrastive-image-text/run_clip.py b/examples/pytorch/contrastive-image-text/run_clip.py index 22b420d856173c..d3c5355f9d07cf 100644 --- a/examples/pytorch/contrastive-image-text/run_clip.py +++ b/examples/pytorch/contrastive-image-text/run_clip.py @@ -90,7 +90,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py index f8c2c95f59592e..2d26e42604da03 100644 --- a/examples/pytorch/image-classification/run_image_classification.py +++ b/examples/pytorch/image-classification/run_image_classification.py @@ -145,7 +145,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, diff --git a/examples/pytorch/image-pretraining/run_mae.py b/examples/pytorch/image-pretraining/run_mae.py index aa8de32d8cb2ed..3ac4106b11acbf 100644 --- a/examples/pytorch/image-pretraining/run_mae.py +++ b/examples/pytorch/image-pretraining/run_mae.py @@ -137,7 +137,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, diff --git a/examples/pytorch/image-pretraining/run_mim.py b/examples/pytorch/image-pretraining/run_mim.py index f60b21600832e2..7626e8be363253 100644 --- a/examples/pytorch/image-pretraining/run_mim.py +++ b/examples/pytorch/image-pretraining/run_mim.py @@ -157,7 +157,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index 53052d7671e061..ca992c04562e5e 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -114,7 +114,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py index dcc8bcd3cd955d..b635a7aea69881 100755 --- a/examples/pytorch/language-modeling/run_mlm.py +++ b/examples/pytorch/language-modeling/run_mlm.py @@ -111,7 +111,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." 
) }, diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py index 15ff8eb45f6ebb..4a885ee49661fd 100755 --- a/examples/pytorch/language-modeling/run_plm.py +++ b/examples/pytorch/language-modeling/run_plm.py @@ -99,7 +99,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, diff --git a/examples/pytorch/multiple-choice/run_swag.py b/examples/pytorch/multiple-choice/run_swag.py index 5771165cafeb8f..f9df919e1f92da 100755 --- a/examples/pytorch/multiple-choice/run_swag.py +++ b/examples/pytorch/multiple-choice/run_swag.py @@ -83,7 +83,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py index cddcb4891beff6..54db2b7bb12d66 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py @@ -83,7 +83,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, diff --git a/examples/pytorch/question-answering/run_qa_beam_search.py b/examples/pytorch/question-answering/run_qa_beam_search.py index 1c389e43f37759..ce110ae3646362 100755 --- a/examples/pytorch/question-answering/run_qa_beam_search.py +++ b/examples/pytorch/question-answering/run_qa_beam_search.py @@ -82,7 +82,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, diff --git a/examples/pytorch/question-answering/run_seq2seq_qa.py b/examples/pytorch/question-answering/run_seq2seq_qa.py index c3c85b31da2f0d..8ffe114dbb8644 100644 --- a/examples/pytorch/question-answering/run_seq2seq_qa.py +++ b/examples/pytorch/question-answering/run_seq2seq_qa.py @@ -83,7 +83,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py index 92d07f8f9199bc..bc1bfb2c1c0945 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py @@ -246,7 +246,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." 
) }, diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py index fdbed2b3ab09a4..36efb44138d9a6 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py @@ -233,7 +233,7 @@ class DataTrainingArguments: metadata={ "help": ( "If :obj:`True`, will use the token generated when running" - ":obj:`transformers-cli login` as HTTP bearer authorization for remote files." + ":obj:`huggingface-cli login` as HTTP bearer authorization for remote files." ) }, ) diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py index 0ce8ff05508ea9..015c1f0a653222 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py @@ -89,7 +89,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py index 78d5b79ca4274a..5d6d5d5c771b3a 100755 --- a/examples/pytorch/summarization/run_summarization.py +++ b/examples/pytorch/summarization/run_summarization.py @@ -103,7 +103,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py index 556e1f3bbe66ef..49af0c85568c9b 100755 --- a/examples/pytorch/text-classification/run_glue.py +++ b/examples/pytorch/text-classification/run_glue.py @@ -192,7 +192,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, diff --git a/examples/pytorch/text-classification/run_xnli.py b/examples/pytorch/text-classification/run_xnli.py index 2450d24e3d0169..d4cfc3a77d0b6d 100755 --- a/examples/pytorch/text-classification/run_xnli.py +++ b/examples/pytorch/text-classification/run_xnli.py @@ -156,7 +156,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." 
) }, diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py index 13993e58a450f1..9000b5006e03fa 100755 --- a/examples/pytorch/token-classification/run_ner.py +++ b/examples/pytorch/token-classification/run_ner.py @@ -83,7 +83,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py index a519fa17533591..af1868b25aad35 100755 --- a/examples/pytorch/translation/run_translation.py +++ b/examples/pytorch/translation/run_translation.py @@ -93,7 +93,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, diff --git a/examples/research_projects/layoutlmv3/run_funsd_cord.py b/examples/research_projects/layoutlmv3/run_funsd_cord.py index 66be61dffccf20..866f9a9c1b1163 100644 --- a/examples/research_projects/layoutlmv3/run_funsd_cord.py +++ b/examples/research_projects/layoutlmv3/run_funsd_cord.py @@ -81,7 +81,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, diff --git a/examples/research_projects/mlm_wwm/run_mlm_wwm.py b/examples/research_projects/mlm_wwm/run_mlm_wwm.py index 0afa4135537a85..f14ad5adfeff16 100644 --- a/examples/research_projects/mlm_wwm/run_mlm_wwm.py +++ b/examples/research_projects/mlm_wwm/run_mlm_wwm.py @@ -101,7 +101,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, diff --git a/examples/research_projects/quantization-qdqbert/run_quant_qa.py b/examples/research_projects/quantization-qdqbert/run_quant_qa.py index 97eece4c1d0ac9..5008197b8b845d 100755 --- a/examples/research_projects/quantization-qdqbert/run_quant_qa.py +++ b/examples/research_projects/quantization-qdqbert/run_quant_qa.py @@ -84,7 +84,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." 
) }, diff --git a/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py b/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py index afa3397eb43000..5294e6a4a9aef9 100755 --- a/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py +++ b/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py @@ -231,7 +231,7 @@ class DataTrainingArguments: metadata={ "help": ( "If :obj:`True`, will use the token generated when running" - ":obj:`transformers-cli login` as HTTP bearer authorization for remote files." + ":obj:`huggingface-cli login` as HTTP bearer authorization for remote files." ) }, ) diff --git a/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_streaming.py b/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_streaming.py index 57f54048a52330..8add8fd20a72d9 100644 --- a/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_streaming.py +++ b/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_streaming.py @@ -234,7 +234,7 @@ class DataTrainingArguments: metadata={ "help": ( "If :obj:`True`, will use the token generated when running" - ":obj:`transformers-cli login` as HTTP bearer authorization for remote files." + ":obj:`huggingface-cli login` as HTTP bearer authorization for remote files." ) }, ) diff --git a/examples/research_projects/tapex/run_tabfact_with_tapex.py b/examples/research_projects/tapex/run_tabfact_with_tapex.py index 19c21c33948edb..23d094f8992a63 100644 --- a/examples/research_projects/tapex/run_tabfact_with_tapex.py +++ b/examples/research_projects/tapex/run_tabfact_with_tapex.py @@ -175,7 +175,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, diff --git a/examples/research_projects/tapex/run_wikisql_with_tapex.py b/examples/research_projects/tapex/run_wikisql_with_tapex.py index 7573893629c6d6..1d402fa7e8f0e9 100644 --- a/examples/research_projects/tapex/run_wikisql_with_tapex.py +++ b/examples/research_projects/tapex/run_wikisql_with_tapex.py @@ -104,7 +104,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, diff --git a/examples/research_projects/tapex/run_wikitablequestions_with_tapex.py b/examples/research_projects/tapex/run_wikitablequestions_with_tapex.py index 7ffa8f5f91cc43..6f93f9b5166929 100644 --- a/examples/research_projects/tapex/run_wikitablequestions_with_tapex.py +++ b/examples/research_projects/tapex/run_wikitablequestions_with_tapex.py @@ -102,7 +102,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." 
) }, diff --git a/examples/research_projects/xtreme-s/run_xtreme_s.py b/examples/research_projects/xtreme-s/run_xtreme_s.py index d3e4f5cb38abf9..16fc1ac8a39c32 100644 --- a/examples/research_projects/xtreme-s/run_xtreme_s.py +++ b/examples/research_projects/xtreme-s/run_xtreme_s.py @@ -287,7 +287,7 @@ class DataTrainingArguments: metadata={ "help": ( "If :obj:`True`, will use the token generated when running" - ":obj:`transformers-cli login` as HTTP bearer authorization for remote files." + ":obj:`huggingface-cli login` as HTTP bearer authorization for remote files." ) }, ) diff --git a/examples/tensorflow/language-modeling/run_clm.py b/examples/tensorflow/language-modeling/run_clm.py index 46c8d339d970c3..3f12683d10d997 100755 --- a/examples/tensorflow/language-modeling/run_clm.py +++ b/examples/tensorflow/language-modeling/run_clm.py @@ -114,7 +114,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, diff --git a/examples/tensorflow/language-modeling/run_mlm.py b/examples/tensorflow/language-modeling/run_mlm.py index 46b27dab662519..b421ed8e669c15 100755 --- a/examples/tensorflow/language-modeling/run_mlm.py +++ b/examples/tensorflow/language-modeling/run_mlm.py @@ -115,7 +115,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, diff --git a/examples/tensorflow/multiple-choice/run_swag.py b/examples/tensorflow/multiple-choice/run_swag.py index b09b0e5598f514..6ba35bd0fd2023 100644 --- a/examples/tensorflow/multiple-choice/run_swag.py +++ b/examples/tensorflow/multiple-choice/run_swag.py @@ -157,7 +157,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, diff --git a/examples/tensorflow/question-answering/run_qa.py b/examples/tensorflow/question-answering/run_qa.py index bd233f378a4dc9..91293aefb35f55 100755 --- a/examples/tensorflow/question-answering/run_qa.py +++ b/examples/tensorflow/question-answering/run_qa.py @@ -80,7 +80,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, diff --git a/examples/tensorflow/summarization/run_summarization.py b/examples/tensorflow/summarization/run_summarization.py index 5d0737fdeffbb9..6d4cf99e6782f8 100644 --- a/examples/tensorflow/summarization/run_summarization.py +++ b/examples/tensorflow/summarization/run_summarization.py @@ -101,7 +101,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." 
) }, diff --git a/examples/tensorflow/text-classification/run_glue.py b/examples/tensorflow/text-classification/run_glue.py index fe7ef66ece129c..9fb0b3f8e43482 100644 --- a/examples/tensorflow/text-classification/run_glue.py +++ b/examples/tensorflow/text-classification/run_glue.py @@ -183,7 +183,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, diff --git a/examples/tensorflow/text-classification/run_text_classification.py b/examples/tensorflow/text-classification/run_text_classification.py index 210a30344dbc0e..b5d19032971c5b 100644 --- a/examples/tensorflow/text-classification/run_text_classification.py +++ b/examples/tensorflow/text-classification/run_text_classification.py @@ -173,7 +173,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, diff --git a/examples/tensorflow/token-classification/run_ner.py b/examples/tensorflow/token-classification/run_ner.py index cd4eea6feeb6dc..caa47e115a4bfa 100644 --- a/examples/tensorflow/token-classification/run_ner.py +++ b/examples/tensorflow/token-classification/run_ner.py @@ -83,7 +83,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, diff --git a/examples/tensorflow/translation/run_translation.py b/examples/tensorflow/translation/run_translation.py index 6e12288fd44f52..7f5eb9eb9defb7 100644 --- a/examples/tensorflow/translation/run_translation.py +++ b/examples/tensorflow/translation/run_translation.py @@ -95,7 +95,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, diff --git a/scripts/tatoeba/README.md b/scripts/tatoeba/README.md index b86caf51d725b0..7c492ec4f46e2e 100644 --- a/scripts/tatoeba/README.md +++ b/scripts/tatoeba/README.md @@ -57,7 +57,7 @@ To upload all converted models, 2. Login to `transformers-cli` ```bash -transformers-cli login +huggingface-cli login ``` 3. Run the `upload_models` script diff --git a/scripts/tatoeba/upload_models.sh b/scripts/tatoeba/upload_models.sh index 07c21edcbd519e..536eb5bc68c4c4 100755 --- a/scripts/tatoeba/upload_models.sh +++ b/scripts/tatoeba/upload_models.sh @@ -2,7 +2,7 @@ for FILE in converted/*; do model_name=`basename $FILE` - transformers-cli repo create $model_name -y + huggingface-cli repo create $model_name -y git clone https://huggingface.co/Helsinki-NLP/$model_name mv $FILE/* $model_name/ cd $model_name diff --git a/src/transformers/commands/user.py b/src/transformers/commands/user.py index 2f073235d25648..938f4c8ea8b616 100644 --- a/src/transformers/commands/user.py +++ b/src/transformers/commands/user.py @@ -22,9 +22,6 @@ from . 
import BaseTransformersCLICommand -UPLOAD_MAX_FILES = 15 - - class UserCommands(BaseTransformersCLICommand): @staticmethod def register_subcommand(parser: ArgumentParser): @@ -105,7 +102,7 @@ class LoginCommand(BaseUserCommand): def run(self): print( ANSI.red( "ERROR! `transformers-cli login` uses an outdated login mechanism " "that is not compatible with the Hugging Face Hub backend anymore. " - "Please use `huggingface-cli login instead." + "Please use `huggingface-cli login` instead." ) diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index b10475127b4fce..b924cec9ae021c 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -463,7 +463,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. use_auth_token (`str` or *bool*, *optional*): The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `transformers-cli login` (stored in `~/.huggingface`). + when running `huggingface-cli login` (stored in `~/.huggingface`). revision (`str`, *optional*, defaults to `"main"`): The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any diff --git a/src/transformers/dynamic_module_utils.py b/src/transformers/dynamic_module_utils.py index 7baafd214c2558..da1434067cbdf8 100644 --- a/src/transformers/dynamic_module_utils.py +++ b/src/transformers/dynamic_module_utils.py @@ -195,7 +195,7 @@ def get_cached_module_file( 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. use_auth_token (`str` or *bool*, *optional*): The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `transformers-cli login` (stored in `~/.huggingface`). + when running `huggingface-cli login` (stored in `~/.huggingface`). revision (`str`, *optional*, defaults to `"main"`): The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any @@ -345,7 +345,7 @@ def get_class_from_dynamic_module( 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. use_auth_token (`str` or `bool`, *optional*): The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `transformers-cli login` (stored in `~/.huggingface`). + when running `huggingface-cli login` (stored in `~/.huggingface`). revision (`str`, *optional*, defaults to `"main"`): The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any diff --git a/src/transformers/feature_extraction_utils.py b/src/transformers/feature_extraction_utils.py index ec68f355191c1d..394d67a8c5a1a7 100644 --- a/src/transformers/feature_extraction_utils.py +++ b/src/transformers/feature_extraction_utils.py @@ -251,7 +251,7 @@ def from_pretrained( 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. use_auth_token (`str` or *bool*, *optional*): The token to use as HTTP bearer authorization for remote files.
If `True`, will use the token generated - when running `transformers-cli login` (stored in `~/.huggingface`). + when running `huggingface-cli login` (stored in `~/.huggingface`). revision (`str`, *optional*, defaults to `"main"`): The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index 1a63d32e4196a0..354bd9592f30cd 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -2096,7 +2096,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): Whether or not to only look at local files (e.g., not try doanloading the model). use_auth_token (`str` or *bool*, *optional*): The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `transformers-cli login` (stored in `~/.huggingface`). + when running `huggingface-cli login` (stored in `~/.huggingface`). revision (`str`, *optional*, defaults to `"main"`): The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any @@ -2472,8 +2472,8 @@ def push_to_hub( Whether or not the repository created should be private (requires a paying subscription). use_auth_token (`bool` or `str`, *optional*): The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `transformers-cli login` (stored in `~/.huggingface`). Will default to `True` if - `repo_url` is not specified. + when running `huggingface-cli login` (stored in `~/.huggingface`). Will default to `True` if `repo_url` + is not specified. max_shard_size (`int` or `str`, *optional*, defaults to `"10GB"`): Only applicable for models. The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size lower than this size. If expressed as a string, needs to be digits followed diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 8709ec66365c66..771f1d2d5d84c0 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1659,7 +1659,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P Whether or not to only look at local files (i.e., do not try to download the model). use_auth_token (`str` or *bool*, *optional*): The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `transformers-cli login` (stored in `~/.huggingface`). + when running `huggingface-cli login` (stored in `~/.huggingface`). revision (`str`, *optional*, defaults to `"main"`): The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index ed526369df4f38..db581d03d8fb7e 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -142,7 +142,7 @@ def get_feature_extractor_config( 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. 
use_auth_token (`str` or *bool*, *optional*): The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `transformers-cli login` (stored in `~/.huggingface`). + when running `huggingface-cli login` (stored in `~/.huggingface`). revision (`str`, *optional*, defaults to `"main"`): The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any @@ -247,7 +247,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. use_auth_token (`str` or *bool*, *optional*): The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `transformers-cli login` (stored in `~/.huggingface`). + when running `huggingface-cli login` (stored in `~/.huggingface`). revision (`str`, *optional*, defaults to `"main"`): The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index d81dd19ea23dde..aed7b4b9761373 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -135,7 +135,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. use_auth_token (`str` or *bool*, *optional*): The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `transformers-cli login` (stored in `~/.huggingface`). + when running `huggingface-cli login` (stored in `~/.huggingface`). revision (`str`, *optional*, defaults to `"main"`): The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 7a2dc2941fdd09..d8759fd4e7842e 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -357,7 +357,7 @@ def get_tokenizer_config( 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. use_auth_token (`str` or *bool*, *optional*): The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `transformers-cli login` (stored in `~/.huggingface`). + when running `huggingface-cli login` (stored in `~/.huggingface`). revision (`str`, *optional*, defaults to `"main"`): The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py index d2a4b663801d78..104726bbd8cc7a 100755 --- a/src/transformers/pipelines/__init__.py +++ b/src/transformers/pipelines/__init__.py @@ -505,7 +505,7 @@ def pipeline( Whether or not to use a Fast tokenizer if possible (a [`PreTrainedTokenizerFast`]). 
use_auth_token (`str` or *bool*, *optional*): The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `transformers-cli login` (stored in `~/.huggingface`). + when running `huggingface-cli login` (stored in `~/.huggingface`). device_map (`str` or `Dict[str, Union[int, str, torch.device]`, *optional*): Sent directly as `model_kwargs` (just a simpler shortcut). When `accelerate` library is present, set `device_map="auto"` to compute the most optimized `device_map` automatically. [More diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index fc1c0ff8da3b32..91537ef46cc864 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1596,7 +1596,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. use_auth_token (`str` or *bool*, *optional*): The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `transformers-cli login` (stored in `~/.huggingface`). + when running `huggingface-cli login` (stored in `~/.huggingface`). local_files_only (`bool`, *optional*, defaults to `False`): Whether or not to only rely on local files and not to attempt to download any files. revision (`str`, *optional*, defaults to `"main"`): diff --git a/src/transformers/utils/hub.py b/src/transformers/utils/hub.py index 2488ab8f690865..1aa086da6721ec 100644 --- a/src/transformers/utils/hub.py +++ b/src/transformers/utils/hub.py @@ -716,7 +716,7 @@ def cached_file( 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. use_auth_token (`str` or *bool*, *optional*): The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `transformers-cli login` (stored in `~/.huggingface`). + when running `huggingface-cli login` (stored in `~/.huggingface`). revision (`str`, *optional*, defaults to `"main"`): The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any @@ -870,7 +870,7 @@ def get_file_from_repo( 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. use_auth_token (`str` or *bool*, *optional*): The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `transformers-cli login` (stored in `~/.huggingface`). + when running `huggingface-cli login` (stored in `~/.huggingface`). revision (`str`, *optional*, defaults to `"main"`): The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any @@ -983,7 +983,7 @@ def get_list_of_files( identifier allowed by git. use_auth_token (`str` or *bool*, *optional*): The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `transformers-cli login` (stored in `~/.huggingface`). + when running `huggingface-cli login` (stored in `~/.huggingface`). local_files_only (`bool`, *optional*, defaults to `False`): Whether or not to only rely on local files and not to attempt to download any files. 
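All of these docstring updates point at the same login flow. As a quick reference, the token written by `huggingface-cli login` is picked up like this (a minimal sketch; the private repo id and token string are illustrative):

```python
from transformers import AutoModel, AutoTokenizer

# `use_auth_token=True` reads the token stored by `huggingface-cli login`
# (under `~/.huggingface`), so private repos resolve without extra setup.
model = AutoModel.from_pretrained("my-org/my-private-model", use_auth_token=True)

# A raw token string also works, e.g. in CI where no login step was run.
tokenizer = AutoTokenizer.from_pretrained("my-org/my-private-model", use_auth_token="hf_...")
```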
@@ -1161,8 +1161,8 @@ def push_to_hub( Whether or not the repository created should be private (requires a paying subscription). use_auth_token (`bool` or `str`, *optional*): The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `transformers-cli login` (stored in `~/.huggingface`). Will default to `True` if - `repo_url` is not specified. + when running `huggingface-cli login` (stored in `~/.huggingface`). Will default to `True` if `repo_url` + is not specified. max_shard_size (`int` or `str`, *optional*, defaults to `"10GB"`): Only applicable for models. The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size lower than this size. If expressed as a string, needs to be digits followed diff --git a/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py b/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py index f07029ec242caa..e7a622edd71527 100755 --- a/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py +++ b/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py @@ -118,7 +118,7 @@ class ModelArguments: use_auth_token: bool = field( default=False, metadata={ - "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "help": "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." }, ) diff --git a/tests/sagemaker/scripts/pytorch/run_glue_model_parallelism.py b/tests/sagemaker/scripts/pytorch/run_glue_model_parallelism.py index 534b1656d10f3e..01185fdabac527 100644 --- a/tests/sagemaker/scripts/pytorch/run_glue_model_parallelism.py +++ b/tests/sagemaker/scripts/pytorch/run_glue_model_parallelism.py @@ -181,7 +181,7 @@ class ModelArguments: default=False, metadata={ "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " "with private models)." ) }, From a61015538d5b7b8b42f164c49be3b2160299781e Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Mon, 8 Aug 2022 14:08:11 +0200 Subject: [PATCH 071/162] Add seed setting to image classification example (#18519) --- .../pytorch/image-classification/run_image_classification.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py index 2d26e42604da03..28000015ab173a 100644 --- a/examples/pytorch/image-classification/run_image_classification.py +++ b/examples/pytorch/image-classification/run_image_classification.py @@ -43,6 +43,7 @@ HfArgumentParser, Trainer, TrainingArguments, + set_seed, ) from transformers.trainer_utils import get_last_checkpoint from transformers.utils import check_min_version, send_example_telemetry @@ -214,6 +215,9 @@ def main(): "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) + # Set seed before initializing model. + set_seed(training_args.seed) + # Initialize our dataset and prepare it for the 'image-classification' task. 
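     # A dataset name loads the data from the Hub; without one, the script falls back to local image folders
     # passed via `--train_dir`/`--validation_dir`.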
if data_args.dataset_name is not None:
        dataset = load_dataset(

From 2f493d547ae0c9aa00a012539e7bdaa1df384622 Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Mon, 8 Aug 2022 14:25:56 +0200
Subject: [PATCH 072/162] [DX fix] Fixing QA pipeline streaming a dataset.
 (#18516)

* [DX fix] Fixing QA pipeline streaming a dataset.

QuestionAnsweringArgumentHandler would iterate over the whole dataset
effectively killing all properties of the pipeline.
This restores nice properties when using `Dataset` or `Generator` since
those are meant to be consumed lazily.

* Handling TF better.
---
 .../pipelines/question_answering.py           | 17 ++++++++++++++++-
 .../test_pipelines_question_answering.py      | 12 ++++++++++++
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/src/transformers/pipelines/question_answering.py b/src/transformers/pipelines/question_answering.py
index 0f5fbf0370e708..d58762035ef7f8 100644
--- a/src/transformers/pipelines/question_answering.py
+++ b/src/transformers/pipelines/question_answering.py
@@ -1,3 +1,4 @@
+import types
 import warnings
 from collections.abc import Iterable
 from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
@@ -22,8 +23,11 @@
     from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING

+    Dataset = None
+
 if is_torch_available():
     import torch
+    from torch.utils.data import Dataset

     from ..models.auto.modeling_auto import MODEL_FOR_QUESTION_ANSWERING_MAPPING
@@ -82,6 +86,11 @@ def __call__(self, *args, **kwargs):
         else:
             raise ValueError(f"Unknown arguments {kwargs}")

+        # When the user is sending a generator, we need to trust it's a valid example
+        generator_types = (types.GeneratorType, Dataset) if Dataset is not None else (types.GeneratorType,)
+        if isinstance(inputs, generator_types):
+            return inputs
+
         # Normalize inputs
         if isinstance(inputs, dict):
             inputs = [inputs]
@@ -245,12 +254,18 @@ def __call__(self, *args, **kwargs):
         """
         # Convert inputs to features
+
         examples = self._args_parser(*args, **kwargs)
-        if len(examples) == 1:
+        if isinstance(examples, (list, tuple)) and len(examples) == 1:
             return super().__call__(examples[0], **kwargs)
         return super().__call__(examples, **kwargs)

     def preprocess(self, example, padding="do_not_pad", doc_stride=None, max_question_len=64, max_seq_len=None):
+        # XXX: This is special, args_parser will not handle anything generator or dataset like
+        # For those we expect the user to send a simple valid example either directly as a SquadExample or simple dict.
+        # So we still need a little sanitation here.
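+        # A bare dict is promoted to a `SquadExample` below; the answer-related fields are left as `None` at inference time.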
+ if isinstance(example, dict): + example = SquadExample(None, example["question"], example["context"], None, None, None) if max_seq_len is None: max_seq_len = min(self.tokenizer.model_max_length, 384) diff --git a/tests/pipelines/test_pipelines_question_answering.py b/tests/pipelines/test_pipelines_question_answering.py index f34237612c11a9..c3a0da2f2b5e9a 100644 --- a/tests/pipelines/test_pipelines_question_answering.py +++ b/tests/pipelines/test_pipelines_question_answering.py @@ -125,6 +125,18 @@ def test_small_model_pt(self): self.assertEqual(nested_simplify(outputs), {"score": 0.01, "start": 0, "end": 11, "answer": "HuggingFace"}) + @require_torch + def test_small_model_pt_iterator(self): + # https://github.com/huggingface/transformers/issues/18510 + pipe = pipeline(model="sshleifer/tiny-distilbert-base-cased-distilled-squad", batch_size=16, framework="pt") + + def data(): + for i in range(10): + yield {"question": "Where was HuggingFace founded ?", "context": "HuggingFace was founded in Paris."} + + for outputs in pipe(data()): + self.assertEqual(nested_simplify(outputs), {"score": 0.01, "start": 0, "end": 11, "answer": "HuggingFace"}) + @require_torch def test_small_model_pt_softmax_trick(self): question_answerer = pipeline( From 80c33f8d07c1078e06f8601e3332fdb30154271f Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Mon, 8 Aug 2022 08:48:10 -0400 Subject: [PATCH 073/162] Clean up hub (#18497) * Clean up utils.hub * Remove imports * More fixes * Last fix --- src/transformers/__init__.py | 2 - .../convert_pytorch_checkpoint_to_tf2.py | 19 +- src/transformers/dynamic_module_utils.py | 18 +- src/transformers/file_utils.py | 9 - src/transformers/modelcard.py | 77 +-- src/transformers/modeling_tf_utils.py | 4 +- src/transformers/modeling_utils.py | 4 +- src/transformers/models/rag/retrieval_rag.py | 15 +- .../transfo_xl/tokenization_transfo_xl.py | 17 +- src/transformers/pipelines/__init__.py | 4 +- src/transformers/utils/__init__.py | 9 - src/transformers/utils/hub.py | 535 +----------------- tests/utils/test_file_utils.py | 61 -- utils/check_repo.py | 1 - 14 files changed, 67 insertions(+), 708 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index d58bc6f1847eb1..d56c70c024e6ed 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -441,7 +441,6 @@ "TensorType", "add_end_docstrings", "add_start_docstrings", - "cached_path", "is_apex_available", "is_datasets_available", "is_faiss_available", @@ -3216,7 +3215,6 @@ TensorType, add_end_docstrings, add_start_docstrings, - cached_path, is_apex_available, is_datasets_available, is_faiss_available, diff --git a/src/transformers/convert_pytorch_checkpoint_to_tf2.py b/src/transformers/convert_pytorch_checkpoint_to_tf2.py index db7484f4b22701..6a05e40f0f804d 100755 --- a/src/transformers/convert_pytorch_checkpoint_to_tf2.py +++ b/src/transformers/convert_pytorch_checkpoint_to_tf2.py @@ -38,7 +38,6 @@ T5_PRETRAINED_CONFIG_ARCHIVE_MAP, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP, - WEIGHTS_NAME, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -91,11 +90,10 @@ XLMConfig, XLMRobertaConfig, XLNetConfig, - cached_path, is_torch_available, load_pytorch_checkpoint_in_tf2_model, ) -from .utils import hf_bucket_url, logging +from .utils import CONFIG_NAME, WEIGHTS_NAME, cached_file, logging if is_torch_available(): @@ -311,7 +309,7 @@ def 
convert_pt_checkpoint_to_tf(
     # Initialise TF model
     if config_file in aws_config_map:
-        config_file = cached_path(aws_config_map[config_file], force_download=not use_cached_models)
+        config_file = cached_file(config_file, CONFIG_NAME, force_download=not use_cached_models)
     config = config_class.from_json_file(config_file)
     config.output_hidden_states = True
     config.output_attentions = True
@@ -320,8 +318,9 @@ def convert_pt_checkpoint_to_tf(

     # Load weights from tf checkpoint
     if pytorch_checkpoint_path in aws_config_map.keys():
-        pytorch_checkpoint_url = hf_bucket_url(pytorch_checkpoint_path, filename=WEIGHTS_NAME)
-        pytorch_checkpoint_path = cached_path(pytorch_checkpoint_url, force_download=not use_cached_models)
+        pytorch_checkpoint_path = cached_file(
+            pytorch_checkpoint_path, WEIGHTS_NAME, force_download=not use_cached_models
+        )

     # Load PyTorch checkpoint in tf2 model:
     tf_model = load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path)
@@ -395,14 +394,14 @@ def convert_all_pt_checkpoints_to_tf(
         print("-" * 100)

         if config_shortcut_name in aws_config_map:
-            config_file = cached_path(aws_config_map[config_shortcut_name], force_download=not use_cached_models)
+            config_file = cached_file(config_shortcut_name, CONFIG_NAME, force_download=not use_cached_models)
         else:
-            config_file = cached_path(config_shortcut_name, force_download=not use_cached_models)
+            config_file = config_shortcut_name

         if model_shortcut_name in aws_model_maps:
-            model_file = cached_path(aws_model_maps[model_shortcut_name], force_download=not use_cached_models)
+            model_file = cached_file(model_shortcut_name, WEIGHTS_NAME, force_download=not use_cached_models)
         else:
-            model_file = cached_path(model_shortcut_name, force_download=not use_cached_models)
+            model_file = model_shortcut_name

         if os.path.isfile(model_shortcut_name):
             model_shortcut_name = "converted_model"
diff --git a/src/transformers/dynamic_module_utils.py b/src/transformers/dynamic_module_utils.py
index da1434067cbdf8..0c2067cf2e53dd 100644
--- a/src/transformers/dynamic_module_utils.py
+++ b/src/transformers/dynamic_module_utils.py
@@ -24,14 +24,7 @@

 from huggingface_hub import HfFolder, model_info

-from .utils import (
-    HF_MODULES_CACHE,
-    TRANSFORMERS_DYNAMIC_MODULE_NAME,
-    cached_path,
-    hf_bucket_url,
-    is_offline_mode,
-    logging,
-)
+from .utils import HF_MODULES_CACHE, TRANSFORMERS_DYNAMIC_MODULE_NAME, cached_file, is_offline_mode, logging

 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
@@ -219,18 +212,15 @@ def get_cached_module_file(
     # Download and cache module_file from the repo `pretrained_model_name_or_path` or grab it if it's a local file.
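     # A local directory means the module file ships alongside the checkpoint; any other value is treated as a repo id on the Hub.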
pretrained_model_name_or_path = str(pretrained_model_name_or_path) if os.path.isdir(pretrained_model_name_or_path): - module_file_or_url = os.path.join(pretrained_model_name_or_path, module_file) submodule = "local" else: - module_file_or_url = hf_bucket_url( - pretrained_model_name_or_path, filename=module_file, revision=revision, mirror=None - ) submodule = pretrained_model_name_or_path.replace("/", os.path.sep) try: # Load from URL or cache if already cached - resolved_module_file = cached_path( - module_file_or_url, + resolved_module_file = cached_file( + pretrained_model_name_or_path, + module_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies, diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py index 94f9a3326d20e4..aa3681e057bb9d 100644 --- a/src/transformers/file_utils.py +++ b/src/transformers/file_utils.py @@ -69,20 +69,14 @@ add_end_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, - cached_path, cached_property, copy_func, default_cache_path, define_sagemaker_information, - filename_to_url, get_cached_models, get_file_from_repo, - get_from_cache, get_full_repo_name, - get_list_of_files, has_file, - hf_bucket_url, - http_get, http_user_agent, is_apex_available, is_coloredlogs_available, @@ -94,7 +88,6 @@ is_in_notebook, is_ipex_available, is_librosa_available, - is_local_clone, is_offline_mode, is_onnx_available, is_pandas_available, @@ -105,7 +98,6 @@ is_pyctcdecode_available, is_pytesseract_available, is_pytorch_quantization_available, - is_remote_url, is_rjieba_available, is_sagemaker_dp_enabled, is_sagemaker_mp_enabled, @@ -141,5 +133,4 @@ torch_only_method, torch_required, torch_version, - url_to_filename, ) diff --git a/src/transformers/modelcard.py b/src/transformers/modelcard.py index c5d07e11473778..dc842c2abbf72c 100644 --- a/src/transformers/modelcard.py +++ b/src/transformers/modelcard.py @@ -43,15 +43,10 @@ ) from .training_args import ParallelMode from .utils import ( - CONFIG_NAME, MODEL_CARD_NAME, - TF2_WEIGHTS_NAME, - WEIGHTS_NAME, - cached_path, - hf_bucket_url, + cached_file, is_datasets_available, is_offline_mode, - is_remote_url, is_tf_available, is_tokenizers_available, is_torch_available, @@ -153,11 +148,6 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. - find_from_standard_name: (*optional*) boolean, default True: - If the pretrained_model_name_or_path ends with our standard model or config filenames, replace them - with our standard modelcard filename. Can be used to directly feed a model/config url and access the - colocated modelcard. - return_unused_kwargs: (*optional*) bool: - If False, then this function returns just the final model card object. @@ -168,21 +158,15 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): Examples: ```python - modelcard = ModelCard.from_pretrained( - "bert-base-uncased" - ) # Download model card from huggingface.co and cache. - modelcard = ModelCard.from_pretrained( - "./test/saved_model/" - ) # E.g. model card was saved using *save_pretrained('./test/saved_model/')* + # Download model card from huggingface.co and cache. 
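+        # (If the repo has no card, an empty `ModelCard` is returned instead of raising; see the `except` branch below.)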
+ modelcard = ModelCard.from_pretrained("bert-base-uncased") + # Model card was saved using *save_pretrained('./test/saved_model/')* + modelcard = ModelCard.from_pretrained("./test/saved_model/") modelcard = ModelCard.from_pretrained("./test/saved_model/modelcard.json") modelcard = ModelCard.from_pretrained("bert-base-uncased", output_attentions=True, foo=False) ```""" - # This imports every model so let's do it dynamically here. - from transformers.models.auto.configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP - cache_dir = kwargs.pop("cache_dir", None) proxies = kwargs.pop("proxies", None) - find_from_standard_name = kwargs.pop("find_from_standard_name", True) return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) from_pipeline = kwargs.pop("_from_pipeline", None) @@ -190,37 +174,30 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): if from_pipeline is not None: user_agent["using_pipeline"] = from_pipeline - if pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP: - # For simplicity we use the same pretrained url than the configuration files - # but with a different suffix (modelcard.json). This suffix is replaced below. - model_card_file = ALL_PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path] - elif os.path.isdir(pretrained_model_name_or_path): - model_card_file = os.path.join(pretrained_model_name_or_path, MODEL_CARD_NAME) - elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): - model_card_file = pretrained_model_name_or_path + is_local = os.path.isdir(pretrained_model_name_or_path) + if os.path.isfile(pretrained_model_name_or_path): + resolved_model_card_file = pretrained_model_name_or_path + is_local = True else: - model_card_file = hf_bucket_url(pretrained_model_name_or_path, filename=MODEL_CARD_NAME, mirror=None) - - if find_from_standard_name or pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP: - model_card_file = model_card_file.replace(CONFIG_NAME, MODEL_CARD_NAME) - model_card_file = model_card_file.replace(WEIGHTS_NAME, MODEL_CARD_NAME) - model_card_file = model_card_file.replace(TF2_WEIGHTS_NAME, MODEL_CARD_NAME) - - try: - # Load from URL or cache if already cached - resolved_model_card_file = cached_path( - model_card_file, cache_dir=cache_dir, proxies=proxies, user_agent=user_agent - ) - if resolved_model_card_file == model_card_file: - logger.info(f"loading model card file {model_card_file}") - else: - logger.info(f"loading model card file {model_card_file} from cache at {resolved_model_card_file}") - # Load model card - modelcard = cls.from_json_file(resolved_model_card_file) + try: + # Load from URL or cache if already cached + resolved_model_card_file = cached_file( + pretrained_model_name_or_path, + filename=MODEL_CARD_NAME, + cache_dir=cache_dir, + proxies=proxies, + user_agent=user_agent, + ) + if is_local: + logger.info(f"loading model card file {resolved_model_card_file}") + else: + logger.info(f"loading model card file {MODEL_CARD_NAME} from cache at {resolved_model_card_file}") + # Load model card + modelcard = cls.from_json_file(resolved_model_card_file) - except (EnvironmentError, json.JSONDecodeError): - # We fall back on creating an empty model card - modelcard = cls() + except (EnvironmentError, json.JSONDecodeError): + # We fall back on creating an empty model card + modelcard = cls() # Update model card with kwargs if needed to_remove = [] diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py 
index 354bd9592f30cd..703440d80ad71b 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -2156,7 +2156,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): use_auth_token = kwargs.pop("use_auth_token", None) revision = kwargs.pop("revision", None) trust_remote_code = kwargs.pop("trust_remote_code", None) - mirror = kwargs.pop("mirror", None) + _ = kwargs.pop("mirror", None) load_weight_prefix = kwargs.pop("load_weight_prefix", None) from_pipeline = kwargs.pop("_from_pipeline", None) from_auto_class = kwargs.pop("_from_auto", False) @@ -2270,7 +2270,6 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): # message. has_file_kwargs = { "revision": revision, - "mirror": mirror, "proxies": proxies, "use_auth_token": use_auth_token, } @@ -2321,7 +2320,6 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): use_auth_token=use_auth_token, user_agent=user_agent, revision=revision, - mirror=mirror, ) config.name_or_path = pretrained_model_name_or_path diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 771f1d2d5d84c0..2a86128c221bec 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1784,7 +1784,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P use_auth_token = kwargs.pop("use_auth_token", None) revision = kwargs.pop("revision", None) trust_remote_code = kwargs.pop("trust_remote_code", None) - mirror = kwargs.pop("mirror", None) + _ = kwargs.pop("mirror", None) from_pipeline = kwargs.pop("_from_pipeline", None) from_auto_class = kwargs.pop("_from_auto", False) _fast_init = kwargs.pop("_fast_init", True) @@ -1955,7 +1955,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P # message. has_file_kwargs = { "revision": revision, - "mirror": mirror, "proxies": proxies, "use_auth_token": use_auth_token, } @@ -2012,7 +2011,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P use_auth_token=use_auth_token, user_agent=user_agent, revision=revision, - mirror=mirror, subfolder=subfolder, ) diff --git a/src/transformers/models/rag/retrieval_rag.py b/src/transformers/models/rag/retrieval_rag.py index 7a3c5635f24f9b..797c1a7332acf7 100644 --- a/src/transformers/models/rag/retrieval_rag.py +++ b/src/transformers/models/rag/retrieval_rag.py @@ -23,7 +23,7 @@ from ...tokenization_utils import PreTrainedTokenizer from ...tokenization_utils_base import BatchEncoding -from ...utils import cached_path, is_datasets_available, is_faiss_available, is_remote_url, logging, requires_backends +from ...utils import cached_file, is_datasets_available, is_faiss_available, logging, requires_backends from .configuration_rag import RagConfig from .tokenization_rag import RagTokenizer @@ -111,22 +111,21 @@ def __init__(self, vector_size, index_path): self._index_initialized = False def _resolve_path(self, index_path, filename): - assert os.path.isdir(index_path) or is_remote_url(index_path), "Please specify a valid `index_path`." - archive_file = os.path.join(index_path, filename) + is_local = os.path.isdir(index_path) try: # Load from URL or cache if already cached - resolved_archive_file = cached_path(archive_file) + resolved_archive_file = cached_file(index_path, filename) except EnvironmentError: msg = ( - f"Can't load '{archive_file}'. Make sure that:\n\n" + f"Can't load '{filename}'. 
Make sure that:\n\n" f"- '{index_path}' is a correct remote path to a directory containing a file named {filename}\n\n" f"- or '{index_path}' is the correct path to a directory containing a file named {filename}.\n\n" ) raise EnvironmentError(msg) - if resolved_archive_file == archive_file: - logger.info(f"loading file {archive_file}") + if is_local: + logger.info(f"loading file {resolved_archive_file}") else: - logger.info(f"loading file {archive_file} from cache at {resolved_archive_file}") + logger.info(f"loading file {filename} from cache at {resolved_archive_file}") return resolved_archive_file def _load_passages(self): diff --git a/src/transformers/models/transfo_xl/tokenization_transfo_xl.py b/src/transformers/models/transfo_xl/tokenization_transfo_xl.py index 92bcfe83199e8d..5b284a219a4753 100644 --- a/src/transformers/models/transfo_xl/tokenization_transfo_xl.py +++ b/src/transformers/models/transfo_xl/tokenization_transfo_xl.py @@ -29,7 +29,7 @@ from ...tokenization_utils import PreTrainedTokenizer from ...utils import ( - cached_path, + cached_file, is_sacremoses_available, is_torch_available, logging, @@ -681,24 +681,21 @@ def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, Instantiate a pre-processed corpus. """ vocab = TransfoXLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - if pretrained_model_name_or_path in PRETRAINED_CORPUS_ARCHIVE_MAP: - corpus_file = PRETRAINED_CORPUS_ARCHIVE_MAP[pretrained_model_name_or_path] - else: - corpus_file = os.path.join(pretrained_model_name_or_path, CORPUS_NAME) + is_local = os.path.isdir(pretrained_model_name_or_path) # redirect to the cache, if necessary try: - resolved_corpus_file = cached_path(corpus_file, cache_dir=cache_dir) + resolved_corpus_file = cached_file(pretrained_model_name_or_path, CORPUS_NAME, cache_dir=cache_dir) except EnvironmentError: logger.error( f"Corpus '{pretrained_model_name_or_path}' was not found in corpus list" f" ({', '.join(PRETRAINED_CORPUS_ARCHIVE_MAP.keys())}. We assumed '{pretrained_model_name_or_path}'" - f" was a path or url but couldn't find files {corpus_file} at this path or url." + f" was a path or url but couldn't find files {CORPUS_NAME} at this path or url." ) return None - if resolved_corpus_file == corpus_file: - logger.info(f"loading corpus file {corpus_file}") + if is_local: + logger.info(f"loading corpus file {resolved_corpus_file}") else: - logger.info(f"loading corpus file {corpus_file} from cache at {resolved_corpus_file}") + logger.info(f"loading corpus file {CORPUS_NAME} from cache at {resolved_corpus_file}") # Instantiate tokenizer. 
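        # The empty corpus created below is then populated from the cached corpus file resolved above.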
corpus = cls(*inputs, **kwargs) diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py index 104726bbd8cc7a..dfa75768d8f811 100755 --- a/src/transformers/pipelines/__init__.py +++ b/src/transformers/pipelines/__init__.py @@ -25,6 +25,8 @@ from numpy import isin +from huggingface_hub.file_download import http_get + from ..configuration_utils import PretrainedConfig from ..dynamic_module_utils import get_class_from_dynamic_module from ..feature_extraction_utils import PreTrainedFeatureExtractor @@ -33,7 +35,7 @@ from ..models.auto.tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer from ..tokenization_utils import PreTrainedTokenizer from ..tokenization_utils_fast import PreTrainedTokenizerFast -from ..utils import HUGGINGFACE_CO_RESOLVE_ENDPOINT, http_get, is_tf_available, is_torch_available, logging +from ..utils import HUGGINGFACE_CO_RESOLVE_ENDPOINT, is_tf_available, is_torch_available, logging from .audio_classification import AudioClassificationPipeline from .automatic_speech_recognition import AutomaticSpeechRecognitionPipeline from .base import ( diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index b810be45636e08..c778489e5e3995 100644 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -61,25 +61,16 @@ RepositoryNotFoundError, RevisionNotFoundError, cached_file, - cached_path, default_cache_path, define_sagemaker_information, - filename_to_url, get_cached_models, get_file_from_repo, - get_from_cache, get_full_repo_name, - get_list_of_files, has_file, - hf_bucket_url, - http_get, http_user_agent, - is_local_clone, is_offline_mode, - is_remote_url, move_cache, send_example_telemetry, - url_to_filename, ) from .import_utils import ( ENV_VARS_TRUE_AND_AUTO_VALUES, diff --git a/src/transformers/utils/hub.py b/src/transformers/utils/hub.py index 1aa086da6721ec..5f9421912c49d7 100644 --- a/src/transformers/utils/hub.py +++ b/src/transformers/utils/hub.py @@ -14,44 +14,32 @@ """ Hub utilities: utilities related to download and cache models """ -import copy -import fnmatch -import io import json import os import re import shutil -import subprocess import sys -import tarfile -import tempfile import traceback import warnings from contextlib import contextmanager -from functools import partial -from hashlib import sha256 from pathlib import Path -from typing import BinaryIO, Dict, List, Optional, Tuple, Union -from urllib.parse import urlparse +from typing import Dict, List, Optional, Tuple, Union from uuid import uuid4 -from zipfile import ZipFile, is_zipfile import huggingface_hub import requests -from filelock import FileLock from huggingface_hub import ( CommitOperationAdd, HfFolder, create_commit, create_repo, hf_hub_download, - list_repo_files, + hf_hub_url, whoami, ) from huggingface_hub.constants import HUGGINGFACE_HEADER_X_LINKED_ETAG, HUGGINGFACE_HEADER_X_REPO_COMMIT from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError, RevisionNotFoundError from requests.exceptions import HTTPError -from requests.models import Response from transformers.utils.logging import tqdm from . 
import __version__, logging @@ -128,93 +116,6 @@ def is_offline_mode(): HUGGINGFACE_CO_EXAMPLES_TELEMETRY = HUGGINGFACE_CO_RESOLVE_ENDPOINT + "/api/telemetry/examples" -def is_remote_url(url_or_filename): - parsed = urlparse(url_or_filename) - return parsed.scheme in ("http", "https") - - -def hf_bucket_url( - model_id: str, filename: str, subfolder: Optional[str] = None, revision: Optional[str] = None, mirror=None -) -> str: - """ - Resolve a model identifier, a file name, and an optional revision id, to a huggingface.co-hosted url, redirecting - to Cloudfront (a Content Delivery Network, or CDN) for large files. - - Cloudfront is replicated over the globe so downloads are way faster for the end user (and it also lowers our - bandwidth costs). - - Cloudfront aggressively caches files by default (default TTL is 24 hours), however this is not an issue here - because we migrated to a git-based versioning system on huggingface.co, so we now store the files on S3/Cloudfront - in a content-addressable way (i.e., the file name is its hash). Using content-addressable filenames means cache - can't ever be stale. - - In terms of client-side caching from this library, we base our caching on the objects' ETag. An object' ETag is: - its sha1 if stored in git, or its sha256 if stored in git-lfs. Files cached locally from transformers before v3.5.0 - are not shared with those new files, because the cached file's name contains a hash of the url (which changed). - """ - if subfolder is not None: - filename = f"{subfolder}/{filename}" - - if mirror: - if mirror in ["tuna", "bfsu"]: - raise ValueError("The Tuna and BFSU mirrors are no longer available. Try removing the mirror argument.") - legacy_format = "/" not in model_id - if legacy_format: - return f"{mirror}/{model_id}-{filename}" - else: - return f"{mirror}/{model_id}/{filename}" - - if revision is None: - revision = "main" - return HUGGINGFACE_CO_PREFIX.format(model_id=model_id, revision=revision, filename=filename) - - -def url_to_filename(url: str, etag: Optional[str] = None) -> str: - """ - Convert `url` into a hashed filename in a repeatable way. If `etag` is specified, append its hash to the url's, - delimited by a period. If the url ends with .h5 (Keras HDF5 weights) adds '.h5' to the name so that TF 2.0 can - identify it as a HDF5 file (see - https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380) - """ - url_bytes = url.encode("utf-8") - filename = sha256(url_bytes).hexdigest() - - if etag: - etag_bytes = etag.encode("utf-8") - filename += "." + sha256(etag_bytes).hexdigest() - - if url.endswith(".h5"): - filename += ".h5" - - return filename - - -def filename_to_url(filename, cache_dir=None): - """ - Return the url and etag (which may be `None`) stored for *filename*. Raise `EnvironmentError` if *filename* or its - stored metadata do not exist. 
- """ - if cache_dir is None: - cache_dir = TRANSFORMERS_CACHE - if isinstance(cache_dir, Path): - cache_dir = str(cache_dir) - - cache_path = os.path.join(cache_dir, filename) - if not os.path.exists(cache_path): - raise EnvironmentError(f"file {cache_path} not found") - - meta_path = cache_path + ".json" - if not os.path.exists(meta_path): - raise EnvironmentError(f"file {meta_path} not found") - - with open(meta_path, encoding="utf-8") as meta_file: - metadata = json.load(meta_file) - url = metadata["url"] - etag = metadata["etag"] - - return url, etag - - def get_cached_models(cache_dir: Union[str, Path] = None) -> List[Tuple]: """ Returns a list of tuples representing model binaries that are cached locally. Each tuple has shape `(model_url, @@ -248,108 +149,6 @@ def get_cached_models(cache_dir: Union[str, Path] = None) -> List[Tuple]: return cached_models -def cached_path( - url_or_filename, - cache_dir=None, - force_download=False, - proxies=None, - resume_download=False, - user_agent: Union[Dict, str, None] = None, - extract_compressed_file=False, - force_extract=False, - use_auth_token: Union[bool, str, None] = None, - local_files_only=False, -) -> Optional[str]: - """ - Given something that might be a URL (or might be a local path), determine which. If it's a URL, download the file - and cache it, and return the path to the cached file. If it's already a local path, make sure the file exists and - then return the path - - Args: - cache_dir: specify a cache directory to save the file to (overwrite the default cache dir). - force_download: if True, re-download the file even if it's already cached in the cache dir. - resume_download: if True, resume the download if incompletely received file is found. - user_agent: Optional string or dict that will be appended to the user-agent on remote requests. - use_auth_token: Optional string or boolean to use as Bearer token for remote files. If True, - will get token from ~/.huggingface. - extract_compressed_file: if True and the path point to a zip or tar file, extract the compressed - file in a folder along the archive. - force_extract: if True when extract_compressed_file is True and the archive was already extracted, - re-extract the archive and override the folder where it was extracted. - - Return: - Local path (string) of file or if networking is off, last version of file cached on disk. - - Raises: - In case of non-recoverable file (non-existent or inaccessible url + no cache on disk). - """ - if cache_dir is None: - cache_dir = TRANSFORMERS_CACHE - if isinstance(url_or_filename, Path): - url_or_filename = str(url_or_filename) - if isinstance(cache_dir, Path): - cache_dir = str(cache_dir) - - if is_offline_mode() and not local_files_only: - logger.info("Offline mode: forcing local_files_only=True") - local_files_only = True - - if is_remote_url(url_or_filename): - # URL, so get it from the cache (downloading if necessary) - output_path = get_from_cache( - url_or_filename, - cache_dir=cache_dir, - force_download=force_download, - proxies=proxies, - resume_download=resume_download, - user_agent=user_agent, - use_auth_token=use_auth_token, - local_files_only=local_files_only, - ) - elif os.path.exists(url_or_filename): - # File, and it exists. - output_path = url_or_filename - elif urlparse(url_or_filename).scheme == "": - # File, but it doesn't exist. 
- raise EnvironmentError(f"file {url_or_filename} not found") - else: - # Something unknown - raise ValueError(f"unable to parse {url_or_filename} as a URL or as a local path") - - if extract_compressed_file: - if not is_zipfile(output_path) and not tarfile.is_tarfile(output_path): - return output_path - - # Path where we extract compressed archives - # We avoid '.' in dir name and add "-extracted" at the end: "./model.zip" => "./model-zip-extracted/" - output_dir, output_file = os.path.split(output_path) - output_extract_dir_name = output_file.replace(".", "-") + "-extracted" - output_path_extracted = os.path.join(output_dir, output_extract_dir_name) - - if os.path.isdir(output_path_extracted) and os.listdir(output_path_extracted) and not force_extract: - return output_path_extracted - - # Prevent parallel extractions - lock_path = output_path + ".lock" - with FileLock(lock_path): - shutil.rmtree(output_path_extracted, ignore_errors=True) - os.makedirs(output_path_extracted) - if is_zipfile(output_path): - with ZipFile(output_path, "r") as zip_file: - zip_file.extractall(output_path_extracted) - zip_file.close() - elif tarfile.is_tarfile(output_path): - tar_file = tarfile.open(output_path) - tar_file.extractall(output_path_extracted) - tar_file.close() - else: - raise EnvironmentError(f"Archive format of {output_path} could not be identified") - - return output_path_extracted - - return output_path - - def define_sagemaker_information(): try: instance_data = requests.get(os.environ["ECS_CONTAINER_METADATA_URI"]).json() @@ -399,234 +198,6 @@ def http_user_agent(user_agent: Union[Dict, str, None] = None) -> str: return ua -def _raise_for_status(response: Response): - """ - Internal version of `request.raise_for_status()` that will refine a potential HTTPError. - """ - if "X-Error-Code" in response.headers: - error_code = response.headers["X-Error-Code"] - if error_code == "RepoNotFound": - raise RepositoryNotFoundError(f"404 Client Error: Repository Not Found for url: {response.url}") - elif error_code == "EntryNotFound": - raise EntryNotFoundError(f"404 Client Error: Entry Not Found for url: {response.url}") - elif error_code == "RevisionNotFound": - raise RevisionNotFoundError(f"404 Client Error: Revision Not Found for url: {response.url}") - - if response.status_code == 401: - # The repo was not found and the user is not Authenticated - raise RepositoryNotFoundError( - f"401 Client Error: Repository not found for url: {response.url}. " - "If the repo is private, make sure you are authenticated." - ) - - response.raise_for_status() - - -def http_get( - url: str, - temp_file: BinaryIO, - proxies=None, - resume_size=0, - headers: Optional[Dict[str, str]] = None, - file_name: Optional[str] = None, -): - """ - Download remote file. Do not gobble up errors. 
- """ - headers = copy.deepcopy(headers) - if resume_size > 0: - headers["Range"] = f"bytes={resume_size}-" - r = requests.get(url, stream=True, proxies=proxies, headers=headers) - _raise_for_status(r) - content_length = r.headers.get("Content-Length") - total = resume_size + int(content_length) if content_length is not None else None - # `tqdm` behavior is determined by `utils.logging.is_progress_bar_enabled()` - # and can be set using `utils.logging.enable/disable_progress_bar()` - progress = tqdm( - unit="B", - unit_scale=True, - unit_divisor=1024, - total=total, - initial=resume_size, - desc=f"Downloading {file_name}" if file_name is not None else "Downloading", - ) - for chunk in r.iter_content(chunk_size=1024): - if chunk: # filter out keep-alive new chunks - progress.update(len(chunk)) - temp_file.write(chunk) - progress.close() - - -def get_from_cache( - url: str, - cache_dir=None, - force_download=False, - proxies=None, - etag_timeout=10, - resume_download=False, - user_agent: Union[Dict, str, None] = None, - use_auth_token: Union[bool, str, None] = None, - local_files_only=False, -) -> Optional[str]: - """ - Given a URL, look for the corresponding file in the local cache. If it's not there, download it. Then return the - path to the cached file. - - Return: - Local path (string) of file or if networking is off, last version of file cached on disk. - - Raises: - In case of non-recoverable file (non-existent or inaccessible url + no cache on disk). - """ - if cache_dir is None: - cache_dir = TRANSFORMERS_CACHE - if isinstance(cache_dir, Path): - cache_dir = str(cache_dir) - - os.makedirs(cache_dir, exist_ok=True) - - headers = {"user-agent": http_user_agent(user_agent)} - if isinstance(use_auth_token, str): - headers["authorization"] = f"Bearer {use_auth_token}" - elif use_auth_token: - token = HfFolder.get_token() - if token is None: - raise EnvironmentError("You specified use_auth_token=True, but a huggingface token was not found.") - headers["authorization"] = f"Bearer {token}" - - url_to_download = url - etag = None - if not local_files_only: - try: - r = requests.head(url, headers=headers, allow_redirects=False, proxies=proxies, timeout=etag_timeout) - _raise_for_status(r) - etag = r.headers.get("X-Linked-Etag") or r.headers.get("ETag") - # We favor a custom header indicating the etag of the linked resource, and - # we fallback to the regular etag header. - # If we don't have any of those, raise an error. - if etag is None: - raise OSError( - "Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility." - ) - # In case of a redirect, - # save an extra redirect on the request.get call, - # and ensure we download the exact atomic version even if it changed - # between the HEAD and the GET (unlikely, but hey). - if 300 <= r.status_code <= 399: - url_to_download = r.headers["Location"] - except ( - requests.exceptions.SSLError, - requests.exceptions.ProxyError, - RepositoryNotFoundError, - EntryNotFoundError, - RevisionNotFoundError, - ): - # Actually raise for those subclasses of ConnectionError - # Also raise the custom errors coming from a non existing repo/branch/file as they are caught later on. - raise - except (HTTPError, requests.exceptions.ConnectionError, requests.exceptions.Timeout): - # Otherwise, our Internet connection is down. 
- # etag is None - pass - - filename = url_to_filename(url, etag) - - # get cache path to put the file - cache_path = os.path.join(cache_dir, filename) - - # etag is None == we don't have a connection or we passed local_files_only. - # try to get the last downloaded one - if etag is None: - if os.path.exists(cache_path): - return cache_path - else: - matching_files = [ - file - for file in fnmatch.filter(os.listdir(cache_dir), filename.split(".")[0] + ".*") - if not file.endswith(".json") and not file.endswith(".lock") - ] - if len(matching_files) > 0: - return os.path.join(cache_dir, matching_files[-1]) - else: - # If files cannot be found and local_files_only=True, - # the models might've been found if local_files_only=False - # Notify the user about that - if local_files_only: - fname = url.split("/")[-1] - raise EntryNotFoundError( - f"Cannot find the requested file ({fname}) in the cached path and outgoing traffic has been" - " disabled. To enable model look-ups and downloads online, set 'local_files_only'" - " to False." - ) - else: - raise ValueError( - "Connection error, and we cannot find the requested files in the cached path." - " Please try again or make sure your Internet connection is on." - ) - - # From now on, etag is not None. - if os.path.exists(cache_path) and not force_download: - return cache_path - - # Prevent parallel downloads of the same file with a lock. - lock_path = cache_path + ".lock" - with FileLock(lock_path): - - # If the download just completed while the lock was activated. - if os.path.exists(cache_path) and not force_download: - # Even if returning early like here, the lock will be released. - return cache_path - - if resume_download: - incomplete_path = cache_path + ".incomplete" - - @contextmanager - def _resumable_file_manager() -> "io.BufferedWriter": - with open(incomplete_path, "ab") as f: - yield f - - temp_file_manager = _resumable_file_manager - if os.path.exists(incomplete_path): - resume_size = os.stat(incomplete_path).st_size - else: - resume_size = 0 - else: - temp_file_manager = partial(tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False) - resume_size = 0 - - # Download to temporary file, then copy to cache dir once finished. - # Otherwise you get corrupt cache entries if the download gets interrupted. - with temp_file_manager() as temp_file: - logger.info(f"{url} not found in cache or force_download set to True, downloading to {temp_file.name}") - - # The url_to_download might be messy, so we extract the file name from the original url. - file_name = url.split("/")[-1] - http_get( - url_to_download, - temp_file, - proxies=proxies, - resume_size=resume_size, - headers=headers, - file_name=file_name, - ) - - logger.info(f"storing {url} in cache at {cache_path}") - os.replace(temp_file.name, cache_path) - - # NamedTemporaryFile creates a file with hardwired 0600 perms (ignoring umask), so fixing it. - umask = os.umask(0o666) - os.umask(umask) - os.chmod(cache_path, 0o666 & ~umask) - - logger.info(f"creating metadata file for {cache_path}") - meta = {"url": url, "etag": etag} - meta_path = cache_path + ".json" - with open(meta_path, "w") as meta_file: - json.dump(meta, meta_file) - - return cache_path - - def try_to_load_from_cache(cache_dir, repo_id, filename, revision=None): """ Explores the cache to return the latest cached file for a given revision. 
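The upshot of removing `url_to_filename`/`get_from_cache` is that callers now resolve Hub files in a single step through `cached_file`. Roughly (a sketch with an illustrative repo id and file, not one of the call sites in this patch):

```python
from transformers.utils import cached_file

# One call now covers what `hf_bucket_url` + `cached_path` used to do:
# resolve a filename inside a Hub repo (or a local directory) to a cached local path.
config_path = cached_file("bert-base-uncased", "config.json")
print(config_path)  # a path inside the local Transformers cache
```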
@@ -919,7 +490,6 @@ def has_file( path_or_repo: Union[str, os.PathLike], filename: str, revision: Optional[str] = None, - mirror: Optional[str] = None, proxies: Optional[Dict[str, str]] = None, use_auth_token: Optional[Union[bool, str]] = None, ): @@ -936,7 +506,7 @@ def has_file( if os.path.isdir(path_or_repo): return os.path.isfile(os.path.join(path_or_repo, filename)) - url = hf_bucket_url(path_or_repo, filename=filename, revision=revision, mirror=mirror) + url = hf_hub_url(path_or_repo, filename=filename, revision=revision) headers = {"user-agent": http_user_agent()} if isinstance(use_auth_token, str): @@ -965,89 +535,6 @@ def has_file( return False -def get_list_of_files( - path_or_repo: Union[str, os.PathLike], - revision: Optional[str] = None, - use_auth_token: Optional[Union[bool, str]] = None, - local_files_only: bool = False, -) -> List[str]: - """ - Gets the list of files inside `path_or_repo`. - - Args: - path_or_repo (`str` or `os.PathLike`): - Can be either the id of a repo on huggingface.co or a path to a *directory*. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a - git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any - identifier allowed by git. - use_auth_token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `huggingface-cli login` (stored in `~/.huggingface`). - local_files_only (`bool`, *optional*, defaults to `False`): - Whether or not to only rely on local files and not to attempt to download any files. - - - - This API is not optimized, so calling it a lot may result in connection errors. - - - - Returns: - `List[str]`: The list of files available in `path_or_repo`. - """ - path_or_repo = str(path_or_repo) - # If path_or_repo is a folder, we just return what is inside (subdirectories included). - if os.path.isdir(path_or_repo): - list_of_files = [] - for path, dir_names, file_names in os.walk(path_or_repo): - list_of_files.extend([os.path.join(path, f) for f in file_names]) - return list_of_files - - # Can't grab the files if we are on offline mode. - if is_offline_mode() or local_files_only: - return [] - - # Otherwise we grab the token and use the list_repo_files method. - if isinstance(use_auth_token, str): - token = use_auth_token - elif use_auth_token is True: - token = HfFolder.get_token() - else: - token = None - - try: - return list_repo_files(path_or_repo, revision=revision, token=token) - except HTTPError as e: - raise ValueError( - f"{path_or_repo} is not a local path or a model identifier on the model Hub. Did you make a typo?" - ) from e - - -def is_local_clone(repo_path, repo_url): - """ - Checks if the folder in `repo_path` is a local clone of `repo_url`. - """ - # First double-check that `repo_path` is a git repo - if not os.path.exists(os.path.join(repo_path, ".git")): - return False - test_git = subprocess.run("git branch".split(), cwd=repo_path) - if test_git.returncode != 0: - return False - - # Then look at its remotes - remotes = subprocess.run( - "git remote -v".split(), - stderr=subprocess.PIPE, - stdout=subprocess.PIPE, - check=True, - encoding="utf-8", - cwd=repo_path, - ).stdout - - return repo_url in remotes.split() - - class PushToHubMixin: """ A Mixin containing the functionality to push a model or tokenizer to the hub. 
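For context, this mixin is what backs the `push_to_hub` docstrings touched earlier in the patch; a typical call looks like this (a minimal sketch; the target repo name is illustrative):

```python
from transformers import AutoModel

model = AutoModel.from_pretrained("bert-base-uncased")
# With `use_auth_token=True`, the token from `huggingface-cli login` is reused.
model.push_to_hub("my-bert-copy", use_auth_token=True)
```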
@@ -1310,7 +797,6 @@ def get_checkpoint_shard_files( use_auth_token=None, user_agent=None, revision=None, - mirror=None, subfolder="", ): """ @@ -1343,18 +829,11 @@ def get_checkpoint_shard_files( # At this stage pretrained_model_name_or_path is a model identifier on the Hub cached_filenames = [] for shard_filename in shard_filenames: - shard_url = hf_bucket_url( - pretrained_model_name_or_path, - filename=shard_filename, - revision=revision, - mirror=mirror, - subfolder=subfolder if len(subfolder) > 0 else None, - ) - try: # Load from URL - cached_filename = cached_path( - shard_url, + cached_filename = cached_file( + pretrained_model_name_or_path, + shard_filename, cache_dir=cache_dir, force_download=force_download, proxies=proxies, @@ -1362,6 +841,8 @@ def get_checkpoint_shard_files( local_files_only=local_files_only, use_auth_token=use_auth_token, user_agent=user_agent, + revision=revision, + subfolder=subfolder, ) # We have already dealt with RepositoryNotFoundError and RevisionNotFoundError when getting the index, so # we don't have to catch them here. diff --git a/tests/utils/test_file_utils.py b/tests/utils/test_file_utils.py index 19adfe21dd4bf6..60676e9f7d9d37 100644 --- a/tests/utils/test_file_utils.py +++ b/tests/utils/test_file_utils.py @@ -26,20 +26,13 @@ from transformers import * # noqa F406 from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER from transformers.utils import ( - CONFIG_NAME, FLAX_WEIGHTS_NAME, TF2_WEIGHTS_NAME, WEIGHTS_NAME, ContextManagers, - EntryNotFoundError, - RepositoryNotFoundError, - RevisionNotFoundError, - filename_to_url, find_labels, get_file_from_repo, - get_from_cache, has_file, - hf_bucket_url, is_flax_available, is_tf_available, is_torch_available, @@ -85,60 +78,6 @@ def test_module_spec_available(self): class GetFromCacheTests(unittest.TestCase): - def test_bogus_url(self): - # This lets us simulate no connection - # as the error raised is the same - # `ConnectionError` - url = "https://bogus" - with self.assertRaisesRegex(ValueError, "Connection error"): - _ = get_from_cache(url) - - def test_file_not_found(self): - # Valid revision (None) but missing file. - url = hf_bucket_url(MODEL_ID, filename="missing.bin") - with self.assertRaisesRegex(EntryNotFoundError, "404 Client Error"): - _ = get_from_cache(url) - - def test_model_not_found_not_authenticated(self): - # Invalid model id. - url = hf_bucket_url("bert-base", filename="pytorch_model.bin") - with self.assertRaisesRegex(RepositoryNotFoundError, "401 Client Error"): - _ = get_from_cache(url) - - @unittest.skip("No authentication when testing against prod") - def test_model_not_found_authenticated(self): - # Invalid model id. 
- url = hf_bucket_url("bert-base", filename="pytorch_model.bin") - with self.assertRaisesRegex(RepositoryNotFoundError, "404 Client Error"): - _ = get_from_cache(url, use_auth_token="hf_sometoken") - # ^ TODO - if we decide to unskip this: use a real / functional token - - def test_revision_not_found(self): - # Valid file but missing revision - url = hf_bucket_url(MODEL_ID, filename=CONFIG_NAME, revision=REVISION_ID_INVALID) - with self.assertRaisesRegex(RevisionNotFoundError, "404 Client Error"): - _ = get_from_cache(url) - - def test_standard_object(self): - url = hf_bucket_url(MODEL_ID, filename=CONFIG_NAME, revision=REVISION_ID_DEFAULT) - filepath = get_from_cache(url, force_download=True) - metadata = filename_to_url(filepath) - self.assertEqual(metadata, (url, f'"{PINNED_SHA1}"')) - - def test_standard_object_rev(self): - # Same object, but different revision - url = hf_bucket_url(MODEL_ID, filename=CONFIG_NAME, revision=REVISION_ID_ONE_SPECIFIC_COMMIT) - filepath = get_from_cache(url, force_download=True) - metadata = filename_to_url(filepath) - self.assertNotEqual(metadata[1], f'"{PINNED_SHA1}"') - # Caution: check that the etag is *not* equal to the one from `test_standard_object` - - def test_lfs_object(self): - url = hf_bucket_url(MODEL_ID, filename=WEIGHTS_NAME, revision=REVISION_ID_DEFAULT) - filepath = get_from_cache(url, force_download=True) - metadata = filename_to_url(filepath) - self.assertEqual(metadata, (url, f'"{PINNED_SHA256}"')) - def test_has_file(self): self.assertTrue(has_file("hf-internal-testing/tiny-bert-pt-only", WEIGHTS_NAME)) self.assertFalse(has_file("hf-internal-testing/tiny-bert-pt-only", TF2_WEIGHTS_NAME)) diff --git a/utils/check_repo.py b/utils/check_repo.py index 00cc6a048b9b2d..d2271e87ebf178 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -614,7 +614,6 @@ def find_all_documented_objects(): "absl", # External module "add_end_docstrings", # Internal, should never have been in the main init. "add_start_docstrings", # Internal, should never have been in the main init. - "cached_path", # Internal used for downloading models. "convert_tf_weight_name_to_pt_weight_name", # Internal used to convert model weights "logger", # Internal logger "logging", # External module From c6e979f1ff30e7d5fe5103f07be0fa44afbb0073 Mon Sep 17 00:00:00 2001 From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Date: Mon, 8 Aug 2022 18:56:51 +0530 Subject: [PATCH 074/162] update fsdp docs (#18521) * updating fsdp documentation * typo fix --- docs/source/en/main_classes/trainer.mdx | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/docs/source/en/main_classes/trainer.mdx b/docs/source/en/main_classes/trainer.mdx index e5807bd1380f8a..44c9d1d4b01973 100644 --- a/docs/source/en/main_classes/trainer.mdx +++ b/docs/source/en/main_classes/trainer.mdx @@ -567,14 +567,22 @@ as the model saving with FSDP activated is only available with recent fixes. For this, add `--fsdp full_shard` to the command line arguments. - SHARD_GRAD_OP : Shards optimizer states + gradients across data parallel workers/GPUs. For this, add `--fsdp shard_grad_op` to the command line arguments. + - NO_SHARD : No sharding. For this, add `--fsdp no_shard` to the command line arguments. - To offload the parameters and gradients to the CPU, add `--fsdp "full_shard offload"` or `--fsdp "shard_grad_op offload"` to the command line arguments. 
- To automatically recursively wrap layers with FSDP using `default_auto_wrap_policy`, add `--fsdp "full_shard auto_wrap"` or `--fsdp "shard_grad_op auto_wrap"` to the command line arguments.
- To enable both CPU offloading and auto wrapping, add `--fsdp "full_shard offload auto_wrap"` or `--fsdp "shard_grad_op offload auto_wrap"` to the command line arguments.
-- If auto wrapping is enabled, please add `--fsdp_min_num_params ` to command line arguments.
-It specifies FSDP's minimum number of parameters for Default Auto Wrapping.
+- If auto wrapping is enabled, you can use either a transformer-based auto wrap policy or a size-based auto wrap policy.
+  - For the transformer-based auto wrap policy, please add `--fsdp_transformer_layer_cls_to_wrap ` to the command line arguments.
+    This specifies the transformer layer class name (case-sensitive) to wrap, e.g., `BertLayer`, `GPTJBlock`, `T5Block`, ....
+    This is important because submodules that share weights (e.g., embedding layer) should not end up in different FSDP wrapped units.
+    Using this policy, wrapping happens for each block containing Multi-Head Attention followed by a couple of MLP layers.
+    The remaining layers, including the shared embeddings, are conveniently wrapped in the same outermost FSDP unit.
+    Therefore, use this policy for transformer-based models.
+  - For the size-based auto wrap policy, please add `--fsdp_min_num_params ` to the command line arguments.
+    It specifies FSDP's minimum number of parameters for auto wrapping.
 
 **Few caveats to be aware of**
 - Mixed precision is currently not supported with FSDP as we wait for PyTorch to fix support for it.

From 2884397ec222ed02d98e64cf1610447e40eee67e Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Mon, 8 Aug 2022 09:53:08 -0400
Subject: [PATCH 075/162] Fix compatibility with 1.12 (#17925)

* Fix compatibility with 1.12

* Remove pin from examples requirements

* Update torch scatter version

* Fix compatibility with 1.12

* Remove pin from examples requirements

* Update torch scatter version

* fix torch.onnx.symbolic_opset12 import

* Reject bad version

Co-authored-by: ydshieh
---
 .circleci/config.yml                            | 16 ++++++++--------
 examples/pytorch/_tests_requirements.txt        |  1 -
 setup.py                                        |  2 +-
 src/transformers/dependency_versions_table.py   |  2 +-
 .../models/deberta/modeling_deberta.py          |  4 +++-
 .../models/deberta_v2/modeling_deberta_v2.py    |  4 +++-
 src/transformers/models/sew_d/modeling_sew_d.py |  4 +++-
 7 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 46bdc16006a943..83ee65248e9cac 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -83,7 +83,7 @@ jobs:
     - run: git lfs install
     - run: pip install --upgrade pip
     - run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision]
-    - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.11.0+cpu.html
+    - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.12.0+cpu.html
     - run: pip install tensorflow_probability
     - run: pip install https://github.com/kpu/kenlm/archive/master.zip
     - run: pip install git+https://github.com/huggingface/accelerate
@@ -124,7 +124,7 @@ jobs:
     - run: git lfs install
     - run: pip install --upgrade pip
     - run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision]
-    - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.11.0+cpu.html
+    - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.12.0+cpu.html
    - run: pip
install tensorflow_probability - run: pip install https://github.com/kpu/kenlm/archive/master.zip - run: pip install git+https://github.com/huggingface/accelerate @@ -159,7 +159,7 @@ jobs: - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng - run: pip install --upgrade pip - run: pip install .[sklearn,flax,torch,testing,sentencepiece,torch-speech,vision] - - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.11.0+cpu.html + - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.12.0+cpu.html - run: pip install https://github.com/kpu/kenlm/archive/master.zip - run: pip install git+https://github.com/huggingface/accelerate - save_cache: @@ -198,7 +198,7 @@ jobs: - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng - run: pip install --upgrade pip - run: pip install .[sklearn,flax,torch,testing,sentencepiece,torch-speech,vision] - - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.11.0+cpu.html + - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.12.0+cpu.html - run: pip install https://github.com/kpu/kenlm/archive/master.zip - run: pip install git+https://github.com/huggingface/accelerate - save_cache: @@ -231,7 +231,7 @@ jobs: - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng time - run: pip install --upgrade pip - run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm] - - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.11.0+cpu.html + - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.12.0+cpu.html - run: pip install https://github.com/kpu/kenlm/archive/master.zip - run: pip install git+https://github.com/huggingface/accelerate - save_cache: @@ -269,7 +269,7 @@ jobs: - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng - run: pip install --upgrade pip - run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm] - - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.11.0+cpu.html + - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.12.0+cpu.html - run: pip install https://github.com/kpu/kenlm/archive/master.zip - run: pip install git+https://github.com/huggingface/accelerate - save_cache: @@ -439,7 +439,7 @@ jobs: - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng - run: pip install --upgrade pip - run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm] - - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.11.0+cpu.html + - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.12.0+cpu.html - run: pip install https://github.com/kpu/kenlm/archive/master.zip - save_cache: key: v0.5-torch-{{ checksum "setup.py" }} @@ -477,7 +477,7 @@ jobs: - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng - run: pip install --upgrade pip - run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm] - - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.11.0+cpu.html + - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.12.0+cpu.html - run: pip install https://github.com/kpu/kenlm/archive/master.zip - save_cache: key: v0.5-torch-{{ checksum "setup.py" }} diff --git a/examples/pytorch/_tests_requirements.txt 
b/examples/pytorch/_tests_requirements.txt
index 8c3c0c38434cba..979890f4b79c38 100644
--- a/examples/pytorch/_tests_requirements.txt
+++ b/examples/pytorch/_tests_requirements.txt
@@ -22,5 +22,4 @@ protobuf
 torchvision
 jiwer
 librosa
-torch < 1.12
 evaluate >= 0.2.0
diff --git a/setup.py b/setup.py
index 52b7837a88456d..391de689ec4b84 100644
--- a/setup.py
+++ b/setup.py
@@ -162,7 +162,7 @@
     "timeout-decorator",
     "timm",
     "tokenizers>=0.11.1,!=0.11.3,<0.13",
-    "torch>=1.0,<1.12",
+    "torch>=1.0,!=1.12.0",
     "torchaudio",
     "pyctcdecode>=0.3.0",
     "tqdm>=4.27",
diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py
index b0020ab8977867..bb98fcc024aa92 100644
--- a/src/transformers/dependency_versions_table.py
+++ b/src/transformers/dependency_versions_table.py
@@ -68,7 +68,7 @@
     "timeout-decorator": "timeout-decorator",
     "timm": "timm",
     "tokenizers": "tokenizers>=0.11.1,!=0.11.3,<0.13",
-    "torch": "torch>=1.0,<1.12",
+    "torch": "torch>=1.0,!=1.12.0",
     "torchaudio": "torchaudio",
     "pyctcdecode": "pyctcdecode>=0.3.0",
     "tqdm": "tqdm>=4.27",
diff --git a/src/transformers/models/deberta/modeling_deberta.py b/src/transformers/models/deberta/modeling_deberta.py
index 2d9e647c130cab..df3d4d95cd0170 100644
--- a/src/transformers/models/deberta/modeling_deberta.py
+++ b/src/transformers/models/deberta/modeling_deberta.py
@@ -187,6 +187,8 @@ def backward(ctx, grad_output):

     @staticmethod
     def symbolic(g: torch._C.Graph, input: torch._C.Value, local_ctx: Union[float, DropoutContext]) -> torch._C.Value:
+        from torch.onnx import symbolic_opset12
+
         dropout_p = local_ctx
         if isinstance(local_ctx, DropoutContext):
             dropout_p = local_ctx.dropout
@@ -198,7 +200,7 @@ def symbolic(g: torch._C.Graph, input: torch._C.Value, local_ctx: Union[float, D
         # Once https://github.com/pytorch/pytorch/issues/78391 is fixed, do something like:
         # if opset_version < 12:
         #   return torch.onnx.symbolic_opset9.dropout(g, input, dropout_p, train)
-        return torch.onnx.symbolic_opset12.dropout(g, input, dropout_p, train)
+        return symbolic_opset12.dropout(g, input, dropout_p, train)


 class StableDropout(nn.Module):
diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py
index 738981648af956..a513a8280ed51d 100644
--- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py
+++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py
@@ -193,6 +193,8 @@ def backward(ctx, grad_output):

     @staticmethod
     def symbolic(g: torch._C.Graph, input: torch._C.Value, local_ctx: Union[float, DropoutContext]) -> torch._C.Value:
+        from torch.onnx import symbolic_opset12
+
         dropout_p = local_ctx
         if isinstance(local_ctx, DropoutContext):
             dropout_p = local_ctx.dropout
@@ -204,7 +206,7 @@ def symbolic(g: torch._C.Graph, input: torch._C.Value, local_ctx: Union[float, D
         # Once https://github.com/pytorch/pytorch/issues/78391 is fixed, do something like:
         # if opset_version < 12:
         #   return torch.onnx.symbolic_opset9.dropout(g, input, dropout_p, train)
-        return torch.onnx.symbolic_opset12.dropout(g, input, dropout_p, train)
+        return symbolic_opset12.dropout(g, input, dropout_p, train)


 # Copied from transformers.models.deberta.modeling_deberta.StableDropout
diff --git a/src/transformers/models/sew_d/modeling_sew_d.py b/src/transformers/models/sew_d/modeling_sew_d.py
index e582705ab09424..a9a231aec1d8e6 100644
--- a/src/transformers/models/sew_d/modeling_sew_d.py
+++ b/src/transformers/models/sew_d/modeling_sew_d.py
@@ -597,6 +597,8 @@ def backward(ctx,
grad_output): @staticmethod def symbolic(g: torch._C.Graph, input: torch._C.Value, local_ctx: Union[float, DropoutContext]) -> torch._C.Value: + from torch.onnx import symbolic_opset12 + dropout_p = local_ctx if isinstance(local_ctx, DropoutContext): dropout_p = local_ctx.dropout @@ -608,7 +610,7 @@ def symbolic(g: torch._C.Graph, input: torch._C.Value, local_ctx: Union[float, D # Once https://github.com/pytorch/pytorch/issues/78391 is fixed, do something like: # if opset_version < 12: # return torch.onnx.symbolic_opset9.dropout(g, input, dropout_p, train) - return torch.onnx.symbolic_opset12.dropout(g, input, dropout_p, train) + return symbolic_opset12.dropout(g, input, dropout_p, train) # Copied from transformers.models.deberta.modeling_deberta.StableDropout From e9ba674dbe84f4192e7c8cb4bf4ae287084f9b9e Mon Sep 17 00:00:00 2001 From: Sylvain Gugger Date: Mon, 8 Aug 2022 09:54:10 -0400 Subject: [PATCH 076/162] Remove debug statement --- src/transformers/tokenization_utils_base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 91537ef46cc864..f85dc73cb659cb 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1713,7 +1713,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], resolved_vocab_files = {} unresolved_files = [] for file_id, file_path in vocab_files.items(): - print(file_id, file_path) if file_path is None: resolved_vocab_files[file_id] = None else: From dcb1685ee39d6c581bfe4a51e85c52490b7bbb55 Mon Sep 17 00:00:00 2001 From: Ankur Goyal Date: Mon, 8 Aug 2022 07:22:17 -0700 Subject: [PATCH 077/162] Specify en in doc-builder README example (#18526) Co-authored-by: Ankur Goyal --- docs/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/README.md b/docs/README.md index c8399a1ee60255..964a8b8b27a3a4 100644 --- a/docs/README.md +++ b/docs/README.md @@ -43,7 +43,7 @@ Once you have setup the `doc-builder` and additional packages, you can generate typing the following command: ```bash -doc-builder build transformers docs/source/ --build_dir ~/tmp/test-build +doc-builder build transformers docs/source/en/ --build_dir ~/tmp/test-build ``` You can adapt the `--build_dir` to set any temporary folder that you prefer. 
This command will create it and generate From 7072f661c9fa83f7bbb7d3250d66b716104229c8 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Mon, 8 Aug 2022 10:22:27 -0400 Subject: [PATCH 078/162] New cache fixes: add safeguard before looking in folders (#18522) --- src/transformers/utils/hub.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/transformers/utils/hub.py b/src/transformers/utils/hub.py index 5f9421912c49d7..570ff52e707532 100644 --- a/src/transformers/utils/hub.py +++ b/src/transformers/utils/hub.py @@ -133,6 +133,8 @@ def get_cached_models(cache_dir: Union[str, Path] = None) -> List[Tuple]: cache_dir = TRANSFORMERS_CACHE elif isinstance(cache_dir, Path): cache_dir = str(cache_dir) + if not os.path.isdir(cache_dir): + return [] cached_models = [] for file in os.listdir(cache_dir): @@ -210,6 +212,9 @@ def try_to_load_from_cache(cache_dir, repo_id, filename, revision=None): if not os.path.isdir(model_cache): # No cache for this model return None + for subfolder in ["refs", "snapshots"]: + if not os.path.isdir(os.path.join(model_cache, subfolder)): + return None # Resolve refs (for instance to convert main to the associated commit sha) cached_refs = os.listdir(os.path.join(model_cache, "refs")) @@ -873,6 +878,8 @@ def get_all_cached_files(cache_dir=None): cache_dir = TRANSFORMERS_CACHE else: cache_dir = str(cache_dir) + if not os.path.isdir(cache_dir): + return [] cached_files = [] for file in os.listdir(cache_dir): From c5e228eeb2a4e6385bf9b693488f7b3a98902b02 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Mon, 8 Aug 2022 17:44:10 +0200 Subject: [PATCH 079/162] unpin resampy (#18527) Co-authored-by: ydshieh --- setup.py | 3 +-- src/transformers/dependency_versions_table.py | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 391de689ec4b84..05ec2c7617fd98 100644 --- a/setup.py +++ b/setup.py @@ -144,7 +144,6 @@ "ray[tune]", "regex!=2019.12.17", "requests", - "resampy<0.3.1", "rjieba", "rouge-score", "sacrebleu>=1.4.12,<2.0.0", @@ -270,7 +269,7 @@ def run(self): extras["integrations"] = extras["optuna"] + extras["ray"] + extras["sigopt"] extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette") -extras["audio"] = deps_list("librosa", "pyctcdecode", "phonemizer", "resampy") # resampy can be removed once unpinned. 
+extras["audio"] = deps_list("librosa", "pyctcdecode", "phonemizer") # `pip install ".[speech]"` is deprecated and `pip install ".[torch-speech]"` should be used instead extras["speech"] = deps_list("torchaudio") + extras["audio"] extras["torch-speech"] = deps_list("torchaudio") + extras["audio"] diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index bb98fcc024aa92..be3dba684bd58d 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -50,7 +50,6 @@ "ray[tune]": "ray[tune]", "regex": "regex!=2019.12.17", "requests": "requests", - "resampy": "resampy<0.3.1", "rjieba": "rjieba", "rouge-score": "rouge-score", "sacrebleu": "sacrebleu>=1.4.12,<2.0.0", From 6952e9b5721145d596d82d6add18548d7e21c3fe Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Mon, 8 Aug 2022 08:53:52 -0700 Subject: [PATCH 080/162] =?UTF-8?q?=20=E2=9C=A8=20update=20to=20use=20inte?= =?UTF-8?q?rlibrary=20links=20instead=20of=20Markdown=20(#18500)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/source/en/accelerate.mdx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/en/accelerate.mdx b/docs/source/en/accelerate.mdx index c215758d47b6a3..02e05df3907492 100644 --- a/docs/source/en/accelerate.mdx +++ b/docs/source/en/accelerate.mdx @@ -22,7 +22,7 @@ Get started by installing 🤗 Accelerate: pip install accelerate ``` -Then import and create an [`Accelerator`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator) object. `Accelerator` will automatically detect your type of distributed setup and initialize all the necessary components for training. You don't need to explicitly place your model on a device. +Then import and create an [`~accelerate.Accelerator`] object. The [`~accelerate.Accelerator`] will automatically detect your type of distributed setup and initialize all the necessary components for training. You don't need to explicitly place your model on a device. ```py >>> from accelerate import Accelerator @@ -32,7 +32,7 @@ Then import and create an [`Accelerator`](https://huggingface.co/docs/accelerate ## Prepare to accelerate -The next step is to pass all the relevant training objects to the [`prepare`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.prepare) method. This includes your training and evaluation DataLoaders, a model and an optimizer: +The next step is to pass all the relevant training objects to the [`~accelerate.Accelerator.prepare`] method. 
This includes your training and evaluation DataLoaders, a model and an optimizer:
 
 ```py
 >>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
@@ -42,7 +42,7 @@ The next step is to pass all the relevant training objects to the [`prepare`](ht
 
 ## Backward
 
-The last addition is to replace the typical `loss.backward()` in your training loop with 🤗 Accelerate's [`backward`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.backward) method:
+The last addition is to replace the typical `loss.backward()` in your training loop with 🤗 Accelerate's [`~accelerate.Accelerator.backward`] method:
 
 ```py
 >>> for epoch in range(num_epochs):
@@ -121,7 +121,7 @@ accelerate launch train.py
 
 ### Train with a notebook
 
-🤗 Accelerate can also run in a notebook if you're planning on using Colaboratory's TPUs. Wrap all the code responsible for training in a function, and pass it to `notebook_launcher`:
+🤗 Accelerate can also run in a notebook if you're planning on using Colaboratory's TPUs. Wrap all the code responsible for training in a function, and pass it to [`~accelerate.notebook_launcher`]:
 
 ```py
 >>> from accelerate import notebook_launcher

From 8a18ad9cd0124d5eb491959dc0edc8d5fe2f18a5 Mon Sep 17 00:00:00 2001
From: Steven Liu <59462357+stevhliu@users.noreply.github.com>
Date: Mon, 8 Aug 2022 09:31:31 -0700
Subject: [PATCH 081/162] Add example of multimodal usage to pipeline tutorial
 (#18498)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* 📝 add example of multimodal usage to pipeline tutorial

* 🖍 apply feedbacks

* 🖍 apply niels feedback
---
 docs/source/en/pipeline_tutorial.mdx | 39 ++++++++++++++++++++++------
 1 file changed, 31 insertions(+), 8 deletions(-)

diff --git a/docs/source/en/pipeline_tutorial.mdx b/docs/source/en/pipeline_tutorial.mdx
index 7929113209748d..95585b64359f49 100644
--- a/docs/source/en/pipeline_tutorial.mdx
+++ b/docs/source/en/pipeline_tutorial.mdx
@@ -12,21 +12,21 @@ specific language governing permissions and limitations under the License.
 
 # Pipelines for inference
 
-The [`pipeline`] makes it simple to use any model from the [Model Hub](https://huggingface.co/models) for inference on a variety of tasks such as text generation, image segmentation and audio classification. Even if you don't have experience with a specific modality or understand the code powering the models, you can still use them with the [`pipeline`]! This tutorial will teach you to:
+The [`pipeline`] makes it simple to use any model from the [Hub](https://huggingface.co/models) for inference on any language, computer vision, speech, and multimodal tasks. Even if you don't have experience with a specific modality or aren't familiar with the underlying code behind the models, you can still use them for inference with the [`pipeline`]! This tutorial will teach you to:
 
 * Use a [`pipeline`] for inference.
 * Use a specific tokenizer or model.
-* Use a [`pipeline`] for audio and vision tasks.
+* Use a [`pipeline`] for audio, vision, and multimodal tasks.
 
-Take a look at the [`pipeline`] documentation for a complete list of supported tasks.
+Take a look at the [`pipeline`] documentation for a complete list of supported tasks and available parameters.
 
 ## Pipeline usage
 
-While each task has an associated [`pipeline`], it is simpler to use the general [`pipeline`] abstraction which contains all the specific task pipelines.
The [`pipeline`] automatically loads a default model and tokenizer capable of inference for your task. +While each task has an associated [`pipeline`], it is simpler to use the general [`pipeline`] abstraction which contains all the task-specific pipelines. The [`pipeline`] automatically loads a default model and a preprocessing class capable of inference for your task. 1. Start by creating a [`pipeline`] and specify an inference task: @@ -67,7 +67,7 @@ Any additional parameters for your task can also be included in the [`pipeline`] ### Choose a model and tokenizer -The [`pipeline`] accepts any model from the [Model Hub](https://huggingface.co/models). There are tags on the Model Hub that allow you to filter for a model you'd like to use for your task. Once you've picked an appropriate model, load it with the corresponding `AutoModelFor` and [`AutoTokenizer`] class. For example, load the [`AutoModelForCausalLM`] class for a causal language modeling task: +The [`pipeline`] accepts any model from the [Hub](https://huggingface.co/models). There are tags on the Hub that allow you to filter for a model you'd like to use for your task. Once you've picked an appropriate model, load it with the corresponding `AutoModelFor` and [`AutoTokenizer`] class. For example, load the [`AutoModelForCausalLM`] class for a causal language modeling task: ```py >>> from transformers import AutoTokenizer, AutoModelForCausalLM @@ -95,7 +95,7 @@ Pass your input text to the [`pipeline`] to generate some text: ## Audio pipeline -The flexibility of the [`pipeline`] means it can also be extended to audio tasks. +The [`pipeline`] also supports audio tasks like audio classification and automatic speech recognition. For example, let's classify the emotion in this audio clip: @@ -129,9 +129,9 @@ Pass the audio file to the [`pipeline`]: ## Vision pipeline -Finally, using a [`pipeline`] for vision tasks is practically identical. +Using a [`pipeline`] for vision tasks is practically identical. -Specify your vision task and pass your image to the classifier. The imaage can be a link or a local path to the image. For example, what species of cat is shown below? +Specify your task and pass your image to the classifier. The image can be a link or a local path to the image. For example, what species of cat is shown below? ![pipeline-cat-chonk](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg) @@ -146,3 +146,26 @@ Specify your vision task and pass your image to the classifier. The imaage can b >>> preds [{'score': 0.4335, 'label': 'lynx, catamount'}, {'score': 0.0348, 'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor'}, {'score': 0.0324, 'label': 'snow leopard, ounce, Panthera uncia'}, {'score': 0.0239, 'label': 'Egyptian cat'}, {'score': 0.0229, 'label': 'tiger cat'}] ``` + +## Multimodal pipeline + +The [`pipeline`] supports more than one modality. For example, a visual question answering (VQA) task combines text and image. Feel free to use any image link you like and a question you want to ask about the image. The image can be a URL or a local path to the image. + +For example, if you use the same image from the vision pipeline above: + +```py +>>> image = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" +>>> question = "Where is the cat?" 
+``` + +Create a pipeline for `vqa` and pass it the image and question: + +```py +>>> from transformers import pipeline + +>>> vqa = pipeline(task="vqa") +>>> preds = vqa(image=image, question=question) +>>> preds = [{"score": round(pred["score"], 4), "answer": pred["answer"]} for pred in preds] +>>> preds +[{'score': 0.9112, 'answer': 'snow'}, {'score': 0.8796, 'answer': 'in snow'}, {'score': 0.6717, 'answer': 'outside'}, {'score': 0.0291, 'answer': 'on ground'}, {'score': 0.027, 'answer': 'ground'}] +``` \ No newline at end of file From e9c67f74560908304718c1cee7a9c3785db4e6fe Mon Sep 17 00:00:00 2001 From: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Date: Mon, 8 Aug 2022 19:28:51 +0200 Subject: [PATCH 082/162] [VideoMAE] Add model to doc tests (#18523) * Add videomae to doc tests * Add pip install decord Co-authored-by: Niels Rogge --- docker/transformers-all-latest-gpu/Dockerfile | 2 ++ utils/documentation_tests.txt | 1 + 2 files changed, 3 insertions(+) diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile index d82c9f7c777c7e..e97a91f4246fb4 100644 --- a/docker/transformers-all-latest-gpu/Dockerfile +++ b/docker/transformers-all-latest-gpu/Dockerfile @@ -45,6 +45,8 @@ RUN python3 -m pip install -U "itsdangerous<2.1.0" RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate +RUN python3 -m pip install --no-cache-dir decord + # When installing in editable mode, `transformers` is not recognized as a package. # this line must be added in order for python to be aware of transformers. RUN cd transformers && python3 setup.py develop diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt index d523181eae2bde..1941a7343a6bc9 100644 --- a/utils/documentation_tests.txt +++ b/utils/documentation_tests.txt @@ -70,6 +70,7 @@ src/transformers/models/trocr/modeling_trocr.py src/transformers/models/unispeech/modeling_unispeech.py src/transformers/models/unispeech_sat/modeling_unispeech_sat.py src/transformers/models/van/modeling_van.py +src/transformers/models/videomae/modeling_videomae.py src/transformers/models/vilt/modeling_vilt.py src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py src/transformers/models/vit/modeling_vit.py From 7aa5bfd4058e0064e8ae955affe3c0c79a75ebfa Mon Sep 17 00:00:00 2001 From: Mishig Davaadorj Date: Mon, 8 Aug 2022 20:33:34 +0200 Subject: [PATCH 083/162] Update perf_train_gpu_one.mdx (#18532) --- docs/source/en/perf_train_gpu_one.mdx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/source/en/perf_train_gpu_one.mdx b/docs/source/en/perf_train_gpu_one.mdx index ba5bcb456d2220..56cd6c6f10e333 100644 --- a/docs/source/en/perf_train_gpu_one.mdx +++ b/docs/source/en/perf_train_gpu_one.mdx @@ -719,13 +719,16 @@ For some applications, such as pretraining large language models, applying all t Another use case for training on many GPUs is if the model does not fit on a single GPU with all the mentioned tricks. There are still more methods we can apply although life starts to get a bit more complicated. This usually involves some form of pipeline or tensor parallelism where the model itself is distributed across several GPUs. One can also make use of DeepSpeed which implements some of these parallelism strategies along with some more optimization to reduce the memory footprint such as partitioning the optimizer states. You can read more about this in the ["Multi-GPU training" section](perf_train_gpu_many). 
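
The optimizer-state partitioning mentioned in the paragraph above corresponds to DeepSpeed's ZeRO stages. A rough sketch of the hook-up, not part of this patch: the config dict follows DeepSpeed's documented schema, `"auto"` lets the Trainer fill values in from its own arguments, and the DeepSpeed library is assumed to be installed:

```python
# Hypothetical minimal DeepSpeed ZeRO stage 2 setup, which partitions
# optimizer states and gradients across data-parallel workers.
from transformers import TrainingArguments

ds_config = {
    "zero_optimization": {"stage": 2},
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
}

# TrainingArguments accepts either a path to a JSON file or a dict here.
args = TrainingArguments(output_dir="output", deepspeed=ds_config)
```
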
## Inference with torchdynamo
+
 TorchDynamo is a new tracer that uses Python’s frame evaluation API to automatically create FX traces from existing PyTorch programs. After capturing the FX graph, different backends can be deployed to lower the graph to an optimized engine. One solution is using the [TensorRT](https://developer.nvidia.com/tensorrt) or NVFuser as backend. You can choose one option below for performance boost.
+
 ```
 TrainingArguments(torchdynamo="eager")      #enable eager model GPU. No performance boost
 TrainingArguments(torchdynamo="nvfuser")    #enable nvfuser
 TrainingArguments(torchdynamo="fx2trt")     #enable tensorRT fp32
 TrainingArguments(torchdynamo="fx2trt-f16") #enable tensorRT fp16
 ```
+
 This feature involves 3 different libraries. To install them, please follow the instructions below:
 - [Torchdynamo installation](https://github.com/pytorch/torchdynamo#requirements-and-setup)
 - [Functorch installation](https://github.com/pytorch/functorch#install)

From 5606dba1ab8fa5737146dbc9d3b1c6a7f738c762 Mon Sep 17 00:00:00 2001
From: Rasmus Arpe Fogh Jensen
Date: Mon, 8 Aug 2022 21:52:47 +0200
Subject: [PATCH 084/162] Update no_trainer.py scripts to include accelerate
 gradient accumulation wrapper (#18473)

* Added accelerate gradient accumulation wrapper to run_image_classification_no_trainer.py example script

* make fixup changes

* PR comments

* changed input to Accelerator based on PR comment, ran make fixup

* Added comment explaining the sync_gradients statement

* Fixed lr scheduler max steps

* Changed run_clm_no_trainer.py script to use accelerate gradient accum wrapper

* Fixed all scripts except wav2vec2 pretraining to use accelerate gradient accum wrapper

* Added accelerate gradient accum wrapper for wav2vec2_pretraining_no_trainer.py script

* make fixup and lr_scheduler step inserted back into run_qa_beam_search_no_trainer.py

* removed changes to run_wav2vec2_pretraining_no_trainer.py script and fixed using wrong constant in qa_beam_search_no_trainer.py script
---
 .../run_image_classification_no_trainer.py   | 34 ++++++++++-------
 .../language-modeling/run_clm_no_trainer.py  | 34 ++++++++++-------
 .../language-modeling/run_mlm_no_trainer.py  | 34 ++++++++++-------
 .../multiple-choice/run_swag_no_trainer.py   | 36 +++++++++++-------
 .../run_qa_beam_search_no_trainer.py         | 37 +++++++++++++------
 .../question-answering/run_qa_no_trainer.py  | 35 +++++++++++-------
 .../run_semantic_segmentation_no_trainer.py  | 34 ++++++++++-------
 .../run_summarization_no_trainer.py          | 33 ++++++++++-------
 8 files changed, 173 insertions(+), 104 deletions(-)

diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
index f10a54add79158..1bd190d1303e9a 100644
--- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py
+++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
@@ -212,9 +212,14 @@ def main():
     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
# If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment - accelerator = ( - Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator() - ) + accelerator_log_kwargs = {} + + if args.with_tracking: + accelerator_log_kwargs["log_with"] = args.report_to + accelerator_log_kwargs["logging_dir"] = args.output_dir + + accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs) + logger.info(accelerator.state) # Make one log on every process with the configuration for debugging. logging.basicConfig( @@ -384,8 +389,8 @@ def collate_fn(examples): lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, - num_warmup_steps=args.num_warmup_steps, - num_training_steps=args.max_train_steps, + num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, ) # Prepare everything with our `accelerator`. @@ -467,17 +472,20 @@ def collate_fn(examples): if resume_step is not None and step < resume_step: completed_steps += 1 continue - outputs = model(**batch) - loss = outputs.loss - # We keep track of the loss at each epoch - if args.with_tracking: - total_loss += loss.detach().float() - loss = loss / args.gradient_accumulation_steps - accelerator.backward(loss) - if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: + + with accelerator.accumulate(model): + outputs = model(**batch) + loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() + accelerator.backward(loss) optimizer.step() lr_scheduler.step() optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: progress_bar.update(1) completed_steps += 1 diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index 21dc568fd44822..3fd67d5fbf66e4 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -249,9 +249,14 @@ def main(): # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment - accelerator = ( - Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator() - ) + accelerator_log_kwargs = {} + + if args.with_tracking: + accelerator_log_kwargs["log_with"] = args.report_to + accelerator_log_kwargs["logging_dir"] = args.output_dir + + accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs) + # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -486,8 +491,8 @@ def group_texts(examples): lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, - num_warmup_steps=args.num_warmup_steps, - num_training_steps=args.max_train_steps, + num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, ) # Prepare everything with our `accelerator`. 
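
The same change lands in every script below, so the pattern is easier to read in isolation. A self-contained sketch, with a toy model and dataset standing in for whatever each script builds; only the `Accelerator` calls mirror the diffs:

```python
# Condensed sketch of the gradient accumulation wrapper these scripts adopt.
import torch
from torch.utils.data import DataLoader, TensorDataset
from accelerate import Accelerator

accelerator = Accelerator(gradient_accumulation_steps=4)

model = torch.nn.Linear(8, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
dataset = TensorDataset(torch.randn(64, 8), torch.randn(64, 1))
dataloader = DataLoader(dataset, batch_size=4)

model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)

updates = 0
for x, y in dataloader:
    # Inside this context, gradients are only synchronized every 4th
    # micro-batch, and the manual `loss / gradient_accumulation_steps`
    # division from the old loops becomes unnecessary.
    with accelerator.accumulate(model):
        loss = torch.nn.functional.mse_loss(model(x), y)
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()

    # True only on iterations where an optimizer update actually happened,
    # which is why the scripts move their progress bookkeeping behind it.
    if accelerator.sync_gradients:
        updates += 1

print(updates)  # 16 micro-batches with accumulation of 4 -> 4 updates
```
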
@@ -567,17 +572,20 @@ def group_texts(examples): if resume_step is not None and step < resume_step: completed_steps += 1 continue - outputs = model(**batch) - loss = outputs.loss - # We keep track of the loss at each epoch - if args.with_tracking: - total_loss += loss.detach().float() - loss = loss / args.gradient_accumulation_steps - accelerator.backward(loss) - if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: + + with accelerator.accumulate(model): + outputs = model(**batch) + loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() + accelerator.backward(loss) optimizer.step() lr_scheduler.step() optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: progress_bar.update(1) completed_steps += 1 diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index b7b085e5b61bea..80dfcf9a9194e5 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -258,9 +258,14 @@ def main(): # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment - accelerator = ( - Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator() - ) + accelerator_log_kwargs = {} + + if args.with_tracking: + accelerator_log_kwargs["log_with"] = args.report_to + accelerator_log_kwargs["logging_dir"] = args.output_dir + + accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs) + # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -530,8 +535,8 @@ def group_texts(examples): lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, - num_warmup_steps=args.num_warmup_steps, - num_training_steps=args.max_train_steps, + num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, ) # Prepare everything with our `accelerator`. 
@@ -611,17 +616,20 @@ def group_texts(examples): if resume_step is not None and step < resume_step: completed_steps += 1 continue - outputs = model(**batch) - loss = outputs.loss - # We keep track of the loss at each epoch - if args.with_tracking: - total_loss += loss.detach().float() - loss = loss / args.gradient_accumulation_steps - accelerator.backward(loss) - if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: + + with accelerator.accumulate(model): + outputs = model(**batch) + loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() + accelerator.backward(loss) optimizer.step() lr_scheduler.step() optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: progress_bar.update(1) completed_steps += 1 diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py index a3868434b28504..eeb04e417fdfd6 100755 --- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py +++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py @@ -65,7 +65,7 @@ def parse_args(): - parser = argparse.ArgumentParser(description="Finetune a transformers model on a text classification task") + parser = argparse.ArgumentParser(description="Finetune a transformers model on a multiple choice task") parser.add_argument( "--dataset_name", type=str, @@ -284,9 +284,14 @@ def main(): # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment - accelerator = ( - Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator() - ) + accelerator_log_kwargs = {} + + if args.with_tracking: + accelerator_log_kwargs["log_with"] = args.report_to + accelerator_log_kwargs["logging_dir"] = args.output_dir + + accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs) + # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -483,8 +488,8 @@ def preprocess_function(examples): lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, - num_warmup_steps=args.num_warmup_steps, - num_training_steps=args.max_train_steps, + num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, ) # Prepare everything with our `accelerator`. 
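
Note the recurring scheduler change in these hunks: because `lr_scheduler.step()` now runs inside the accumulation context on every micro-batch rather than once per optimizer update, the warmup and total step counts passed to `get_scheduler` are laid out in micro-steps. A quick arithmetic check with illustrative numbers, not taken from the scripts:

```python
# Why the scheduler arguments are multiplied by the accumulation factor.
max_train_steps = 1000           # desired optimizer updates
gradient_accumulation_steps = 8  # micro-batches per update
num_warmup_steps = 100           # warmup measured in optimizer updates

scheduler_total = max_train_steps * gradient_accumulation_steps
scheduler_warmup = num_warmup_steps * gradient_accumulation_steps
print(scheduler_total, scheduler_warmup)  # 8000 800 calls to lr_scheduler.step()
```
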
@@ -567,17 +572,20 @@ def preprocess_function(examples): if resume_step is not None and step < resume_step: completed_steps += 1 continue - outputs = model(**batch) - loss = outputs.loss - # We keep track of the loss at each epoch - if args.with_tracking: - total_loss += loss.detach().float() - loss = loss / args.gradient_accumulation_steps - accelerator.backward(loss) - if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: + + with accelerator.accumulate(model): + outputs = model(**batch) + loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() + accelerator.backward(loss) optimizer.step() lr_scheduler.step() optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: progress_bar.update(1) completed_steps += 1 diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py index 69ddf24ab5aa49..370dd3f43d9545 100644 --- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py @@ -297,8 +297,16 @@ def main(): send_example_telemetry("run_qa_beam_search_no_trainer", args) # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. - # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment - accelerator = Accelerator(log_with="all", logging_dir=args.output_dir) if args.with_tracking else Accelerator() + # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers + # in the environment + accelerator_log_kwargs = {} + + if args.with_tracking: + accelerator_log_kwargs["log_with"] = args.report_to + accelerator_log_kwargs["logging_dir"] = args.output_dir + + accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs) + # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -739,8 +747,8 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, - num_warmup_steps=args.num_warmup_steps, - num_training_steps=args.max_train_steps, + num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, ) # Prepare everything with our `accelerator`. 
@@ -818,17 +826,22 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): if resume_step is not None and step < resume_step: completed_steps += 1 continue - outputs = model(**batch) - loss = outputs.loss - # We keep track of the loss at each epoch - if args.with_tracking: - total_loss += loss.detach().float() - loss = loss / args.gradient_accumulation_steps - accelerator.backward(loss) - if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: + + with accelerator.accumulate(model): + outputs = model(**batch) + loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() + + accelerator.backward(loss) + optimizer.step() lr_scheduler.step() optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: progress_bar.update(1) completed_steps += 1 diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py index d98dca22bf2e48..6bf4eb28e99418 100755 --- a/examples/pytorch/question-answering/run_qa_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_no_trainer.py @@ -337,9 +337,14 @@ def main(): # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment - accelerator = ( - Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator() - ) + accelerator_log_kwargs = {} + + if args.with_tracking: + accelerator_log_kwargs["log_with"] = args.report_to + accelerator_log_kwargs["logging_dir"] = args.output_dir + + accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs) + # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -757,8 +762,8 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, - num_warmup_steps=args.num_warmup_steps, - num_training_steps=args.max_train_steps, + num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, ) # Prepare everything with our `accelerator`. 
@@ -839,17 +844,21 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len): if resume_step is not None and step < resume_step: completed_steps += 1 continue - outputs = model(**batch) - loss = outputs.loss - # We keep track of the loss at each epoch - if args.with_tracking: - total_loss += loss.detach().float() - loss = loss / args.gradient_accumulation_steps - accelerator.backward(loss) - if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: + + with accelerator.accumulate(model): + outputs = model(**batch) + loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() + + accelerator.backward(loss) optimizer.step() lr_scheduler.step() optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: progress_bar.update(1) completed_steps += 1 diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py index 7ffb876d4db58f..30cb7cc53ae318 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py @@ -326,9 +326,14 @@ def main(): # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment - accelerator = ( - Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator() - ) + accelerator_log_kwargs = {} + + if args.with_tracking: + accelerator_log_kwargs["log_with"] = args.report_to + accelerator_log_kwargs["logging_dir"] = args.output_dir + + accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs) + logger.info(accelerator.state, main_process_only=False) if accelerator.is_local_main_process: datasets.utils.logging.set_verbosity_warning() @@ -487,8 +492,8 @@ def preprocess_val(example_batch): lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, - num_warmup_steps=args.num_warmup_steps, - num_training_steps=args.max_train_steps, + num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, ) # Prepare everything with our `accelerator`. 
@@ -563,17 +568,20 @@ def preprocess_val(example_batch): if resume_step is not None and step < resume_step: completed_steps += 1 continue - outputs = model(**batch) - loss = outputs.loss - # We keep track of the loss at each epoch - if args.with_tracking: - total_loss += loss.detach().float() - loss = loss / args.gradient_accumulation_steps - accelerator.backward(loss) - if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: + + with accelerator.accumulate(model): + outputs = model(**batch) + loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() + accelerator.backward(loss) optimizer.step() lr_scheduler.step() optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: progress_bar.update(1) completed_steps += 1 diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py index ca9ef6ba9fa241..96781b6dcadbdd 100644 --- a/examples/pytorch/summarization/run_summarization_no_trainer.py +++ b/examples/pytorch/summarization/run_summarization_no_trainer.py @@ -330,9 +330,13 @@ def main(): # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment - accelerator = ( - Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator() - ) + accelerator_log_kwargs = {} + + if args.with_tracking: + accelerator_log_kwargs["log_with"] = args.report_to + accelerator_log_kwargs["logging_dir"] = args.output_dir + + accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs) if args.source_prefix is None and args.model_name_or_path in [ "t5-small", "t5-base", @@ -552,8 +556,8 @@ def postprocess_text(preds, labels): lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, - num_warmup_steps=args.num_warmup_steps, - num_training_steps=args.max_train_steps, + num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, ) # Prepare everything with our `accelerator`. 
@@ -635,17 +639,20 @@ def postprocess_text(preds, labels):
             if resume_step is not None and step < resume_step:
                 completed_steps += 1
                 continue
-            outputs = model(**batch)
-            loss = outputs.loss
-            # We keep track of the loss at each epoch
-            if args.with_tracking:
-                total_loss += loss.detach().float()
-            loss = loss / args.gradient_accumulation_steps
-            accelerator.backward(loss)
-            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+
+            with accelerator.accumulate(model):
+                outputs = model(**batch)
+                loss = outputs.loss
+                # We keep track of the loss at each epoch
+                if args.with_tracking:
+                    total_loss += loss.detach().float()
+                accelerator.backward(loss)
                 optimizer.step()
                 lr_scheduler.step()
                 optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
                 progress_bar.update(1)
                 completed_steps += 1

From 5b29a580aba1a2324c22bd2ccba5e65e35ff6848 Mon Sep 17 00:00:00 2001
From: Ian Castillo <7807897+donelianc@users.noreply.github.com>
Date: Mon, 8 Aug 2022 21:53:43 +0200
Subject: [PATCH 085/162] Add Spanish translation of
 converting_tensorflow_models.mdx (#18512)

* Add file in Spanish docs to be translated

* Finish translation to Spanish

* Improve Spanish wording

* Add suggested changes from review
---
 docs/source/es/_toctree.yml                   |   2 +
 .../es/converting_tensorflow_models.mdx       | 149 ++++++++++++++++++
 2 files changed, 151 insertions(+)
 create mode 100644 docs/source/es/converting_tensorflow_models.mdx

diff --git a/docs/source/es/_toctree.yml b/docs/source/es/_toctree.yml
index 4145a9649139cc..bd5f917aea4ccb 100644
--- a/docs/source/es/_toctree.yml
+++ b/docs/source/es/_toctree.yml
@@ -39,6 +39,8 @@
     title: Ejecutar el entrenamiento en Amazon SageMaker
   - local: multilingual
     title: Modelos multilingües para inferencia
+  - local: converting_tensorflow_models
+    title: Convertir checkpoints de TensorFlow
   title: Guías prácticas
- sections:
   - local: philosophy
diff --git a/docs/source/es/converting_tensorflow_models.mdx b/docs/source/es/converting_tensorflow_models.mdx
new file mode 100644
index 00000000000000..2ab15e81b2508a
--- /dev/null
+++ b/docs/source/es/converting_tensorflow_models.mdx
@@ -0,0 +1,149 @@
+
+
+# Convertir checkpoints de TensorFlow
+
+Te proporcionamos una interfaz de línea de comando (`CLI`, por sus siglas en inglés) para convertir puntos de control (_checkpoints_) originales de Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM en modelos que se puedan cargar utilizando los métodos `from_pretrained` de la biblioteca.
+
+
+
+Desde 2.3.0, el script para convertir es parte de la CLI de transformers (**transformers-cli**) disponible en cualquier instalación de transformers >= 2.3.0.
+
+La siguiente documentación refleja el formato para el comando **transformers-cli convert**.
+
+
+
+## BERT
+
+Puedes convertir cualquier checkpoint de TensorFlow para BERT (en particular, [los modelos pre-entrenados y publicados por Google](https://github.com/google-research/bert#pre-trained-models)) en un archivo de PyTorch mediante el script [convert_bert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py).
+ +Esta CLI toma como entrada un checkpoint de TensorFlow (tres archivos que comienzan con `bert_model.ckpt`) y el archivo de configuración asociado (`bert_config.json`), y crea un modelo PyTorch para esta configuración, carga los pesos del checkpoint de TensorFlow en el modelo de PyTorch y guarda el modelo resultante en un archivo estándar de PyTorch que se puede importar usando `from_pretrained()` (ve el ejemplo en [Tour rápido](quicktour), [run_glue.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification/run_glue.py)). + +Solo necesitas ejecutar este script **una vez** para convertir un modelo a PyTorch. Después, puedes ignorar el checkpoint de TensorFlow (los tres archivos que comienzan con `bert_model.ckpt`), pero asegúrate de conservar el archivo de configuración (`bert_config.json`) y el archivo de vocabulario (`vocab.txt`) ya que estos también son necesarios para el modelo en PyTorch. + +Para ejecutar este script deberás tener instalado TensorFlow y PyTorch (`pip install tensorflow`). El resto del repositorio solo requiere PyTorch. + +Aquí hay un ejemplo del proceso para convertir un modelo `BERT-Base Uncased` pre-entrenado: + +```bash +export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12 + +transformers-cli convert --model_type bert \ + --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \ + --config $BERT_BASE_DIR/bert_config.json \ + --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin +``` + +Puedes descargar los modelos pre-entrenados de Google para la conversión [aquí](https://github.com/google-research/bert#pre-trained-models). + +## ALBERT + +Convierte los checkpoints del modelo ALBERT de TensorFlow a PyTorch usando el script [convert_albert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py). + +La CLI toma como entrada un checkpoint de TensorFlow (tres archivos que comienzan con `model.ckpt-best`) y el archivo de configuración adjunto (`albert_config.json`), luego crea y guarda un modelo de PyTorch. Para ejecutar esta conversión deberás tener instalados TensorFlow y PyTorch. + +Aquí hay un ejemplo del proceso para convertir un modelo `ALBERT Base` pre-entrenado: + +```bash +export ALBERT_BASE_DIR=/path/to/albert/albert_base + +transformers-cli convert --model_type albert \ + --tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \ + --config $ALBERT_BASE_DIR/albert_config.json \ + --pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin +``` + +Puedes descargar los modelos pre-entrenados de Google para la conversión [aquí](https://github.com/google-research/albert#pre-trained-models). 
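Una vez realizada la conversión, el checkpoint resultante puede cargarse desde Python. El siguiente es solo un esquema ilustrativo (las rutas son de ejemplo; el modelo convertido conserva los nombres originales `bert_config.json` y `vocab.txt`, así que si tu versión de `transformers` no acepta una ruta directa al archivo de pesos, renombra los archivos a `config.json` y `pytorch_model.bin` dentro de un directorio y pasa ese directorio a `from_pretrained`):

```py
# Solo ilustrativo: cargar un checkpoint convertido con `transformers-cli convert`.
from transformers import BertConfig, BertForPreTraining, BertTokenizer

base = "/path/to/bert/uncased_L-12_H-768_A-12"

config = BertConfig.from_json_file(f"{base}/bert_config.json")
model = BertForPreTraining.from_pretrained(f"{base}/pytorch_model.bin", config=config)
tokenizer = BertTokenizer(f"{base}/vocab.txt", do_lower_case=True)
```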
+
+## OpenAI GPT
+
+Este es un ejemplo del proceso para convertir un modelo OpenAI GPT pre-entrenado, asumiendo que tu checkpoint de NumPy se guarda con el mismo formato que el modelo pre-entrenado de OpenAI (más información [aquí](https://github.com/openai/finetune-transformer-lm)):
+
+```bash
+export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
+
+transformers-cli convert --model_type gpt \
+  --tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
+  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
+  [--config OPENAI_GPT_CONFIG] \
+  [--finetuning_task_name OPENAI_GPT_FINETUNED_TASK]
+```
+
+## OpenAI GPT-2
+
+Aquí hay un ejemplo del proceso para convertir un modelo OpenAI GPT-2 pre-entrenado (más información [aquí](https://github.com/openai/gpt-2)):
+
+```bash
+export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights
+
+transformers-cli convert --model_type gpt2 \
+  --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
+  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
+  [--config OPENAI_GPT2_CONFIG] \
+  [--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK]
+```
+
+## Transformer-XL
+
+Aquí hay un ejemplo del proceso para convertir un modelo Transformer-XL pre-entrenado (más información [aquí](https://github.com/kimiyoung/transformer-xl/tree/master/tf#obtain-and-evaluate-pretrained-sota-models)):
+
+```bash
+export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint
+
+transformers-cli convert --model_type transfo_xl \
+  --tf_checkpoint $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
+  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
+  [--config TRANSFO_XL_CONFIG] \
+  [--finetuning_task_name TRANSFO_XL_FINETUNED_TASK]
+```
+
+## XLNet
+
+Aquí hay un ejemplo del proceso para convertir un modelo XLNet pre-entrenado:
+
+```bash
+export XLNET_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
+export XLNET_CONFIG_PATH=/path/to/xlnet/config
+
+transformers-cli convert --model_type xlnet \
+  --tf_checkpoint $XLNET_CHECKPOINT_PATH \
+  --config $XLNET_CONFIG_PATH \
+  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
+  [--finetuning_task_name XLNET_FINETUNED_TASK]
+```
+
+## XLM
+
+Aquí hay un ejemplo del proceso para convertir un modelo XLM pre-entrenado:
+
+```bash
+export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint
+
+transformers-cli convert --model_type xlm \
+  --tf_checkpoint $XLM_CHECKPOINT_PATH \
+  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
+  [--config XLM_CONFIG] \
+  [--finetuning_task_name XLM_FINETUNED_TASK]
+```
+
+## T5
+
+Aquí hay un ejemplo del proceso para convertir un modelo T5 pre-entrenado:
+
+```bash
+export T5=/path/to/t5/uncased_L-12_H-768_A-12
+
+transformers-cli convert --model_type t5 \
+  --tf_checkpoint $T5/t5_model.ckpt \
+  --config $T5/t5_config.json \
+  --pytorch_dump_output $T5/pytorch_model.bin
+```

From defa14cd50b29bb569d3f41d636a9184738177d4 Mon Sep 17 00:00:00 2001
From: AguilaCudicio
Date: Mon, 8 Aug 2022 16:54:11 -0300
Subject: [PATCH 086/162] Spanish translation of summarization.mdx (#15947) (#18477)

* Add Spanish translation of summarization.mdx

* Apply suggestions from code review

Co-authored-by: Omar U. Espejel

Co-authored-by: Omar U.
Espejel --- docs/source/es/_toctree.yml | 4 +- docs/source/es/tasks/summarization.mdx | 222 +++++++++++++++++++++++++ 2 files changed, 225 insertions(+), 1 deletion(-) create mode 100644 docs/source/es/tasks/summarization.mdx diff --git a/docs/source/es/_toctree.yml b/docs/source/es/_toctree.yml index bd5f917aea4ccb..60566b9e6f9b47 100644 --- a/docs/source/es/_toctree.yml +++ b/docs/source/es/_toctree.yml @@ -30,6 +30,8 @@ - sections: - local: tasks/language_modeling title: Modelado de lenguaje + - local: tasks/summarization + title: Generación de resúmenes - local: tasks/image_classification title: Clasificación de imágenes title: Fine-tuning para tareas posteriores @@ -47,4 +49,4 @@ title: Filosofía - local: bertology title: BERTología - title: Guías conceptuales \ No newline at end of file + title: Guías conceptuales diff --git a/docs/source/es/tasks/summarization.mdx b/docs/source/es/tasks/summarization.mdx new file mode 100644 index 00000000000000..c09c4b0b833a13 --- /dev/null +++ b/docs/source/es/tasks/summarization.mdx @@ -0,0 +1,222 @@ + + +# Generación de resúmenes + + + +La generación de resúmenes (summarization, en inglés) crea una versión más corta de un documento o un artículo que resume toda su información importante. Junto con la traducción, es un ejemplo de una tarea que puede ser formulada como una tarea secuencia a secuencia. La generación de resúmenes puede ser: + +- Extractiva: Extrae la información más relevante de un documento. +- Abstractiva: Genera un texto nuevo que captura la información más importante. + +Esta guía te mostrará cómo puedes hacer fine-tuning del modelo [T5](https://huggingface.co/t5-small) sobre el subset de proyectos de ley del estado de California, dentro del dataset [BillSum](https://huggingface.co/datasets/billsum) para hacer generación de resúmenes abstractiva. + + + +Consulta la [página de la tarea](https://huggingface.co/tasks/summarization) de generación de resúmenes para obtener más información sobre sus modelos, datasets y métricas asociadas. + + + +## Carga el dataset BillSum + +Carga el dataset BillSum de la biblioteca 🤗 Datasets: + +```py +>>> from datasets import load_dataset + +>>> billsum = load_dataset("billsum", split="ca_test") +``` + +Divide el dataset en un set de train y un set de test: + +```py +>>> billsum = billsum.train_test_split(test_size=0.2) +``` + +A continuación, observa un ejemplo: + +```py +>>> billsum["train"][0] +{'summary': 'Existing law authorizes state agencies to enter into contracts for the acquisition of goods or services upon approval by the Department of General Services. Existing law sets forth various requirements and prohibitions for those contracts, including, but not limited to, a prohibition on entering into contracts for the acquisition of goods or services of $100,000 or more with a contractor that discriminates between spouses and domestic partners or same-sex and different-sex couples in the provision of benefits. Existing law provides that a contract entered into in violation of those requirements and prohibitions is void and authorizes the state or any person acting on behalf of the state to bring a civil action seeking a determination that a contract is in violation and therefore void. 
Under existing law, a willful violation of those requirements and prohibitions is a misdemeanor.\nThis bill would also prohibit a state agency from entering into contracts for the acquisition of goods or services of $100,000 or more with a contractor that discriminates between employees on the basis of gender identity in the provision of benefits, as specified. By expanding the scope of a crime, this bill would impose a state-mandated local program.\nThe California Constitution requires the state to reimburse local agencies and school districts for certain costs mandated by the state. Statutory provisions establish procedures for making that reimbursement.\nThis bill would provide that no reimbursement is required by this act for a specified reason.', + 'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nSection 10295.35 is added to the Public Contract Code, to read:\n10295.35.\n(a) (1) Notwithstanding any other law, a state agency shall not enter into any contract for the acquisition of goods or services in the amount of one hundred thousand dollars ($100,000) or more with a contractor that, in the provision of benefits, discriminates between employees on the basis of an employee’s or dependent’s actual or perceived gender identity, including, but not limited to, the employee’s or dependent’s identification as transgender.\n(2) For purposes of this section, “contract” includes contracts with a cumulative amount of one hundred thousand dollars ($100,000) or more per contractor in each fiscal year.\n(3) For purposes of this section, an employee health plan is discriminatory if the plan is not consistent with Section 1365.5 of the Health and Safety Code and Section 10140 of the Insurance Code.\n(4) The requirements of this section shall apply only to those portions of a contractor’s operations that occur under any of the following conditions:\n(A) Within the state.\n(B) On real property outside the state if the property is owned by the state or if the state has a right to occupy the property, and if the contractor’s presence at that location is connected to a contract with the state.\n(C) Elsewhere in the United States where work related to a state contract is being performed.\n(b) Contractors shall treat as confidential, to the maximum extent allowed by law or by the requirement of the contractor’s insurance provider, any request by an employee or applicant for employment benefits or any documentation of eligibility for benefits submitted by an employee or applicant for employment.\n(c) After taking all reasonable measures to find a contractor that complies with this section, as determined by the state agency, the requirements of this section may be waived under any of the following circumstances:\n(1) There is only one prospective contractor willing to enter into a specific contract with the state agency.\n(2) The contract is necessary to respond to an emergency, as determined by the state agency, that endangers the public health, welfare, or safety, or the contract is necessary for the provision of essential services, and no entity that complies with the requirements of this section capable of responding to the emergency is immediately available.\n(3) The requirements of this section violate, or are inconsistent with, the terms or conditions of a grant, subvention, or agreement, if the agency has made a good faith attempt to change the terms or conditions of any grant, subvention, or agreement to authorize application of this section.\n(4) The contractor 
is providing wholesale or bulk water, power, or natural gas, the conveyance or transmission of the same, or ancillary services, as required for ensuring reliable services in accordance with good utility practice, if the purchase of the same cannot practically be accomplished through the standard competitive bidding procedures and the contractor is not providing direct retail services to end users.\n(d) (1) A contractor shall not be deemed to discriminate in the provision of benefits if the contractor, in providing the benefits, pays the actual costs incurred in obtaining the benefit.\n(2) If a contractor is unable to provide a certain benefit, despite taking reasonable measures to do so, the contractor shall not be deemed to discriminate in the provision of benefits.\n(e) (1) Every contract subject to this chapter shall contain a statement by which the contractor certifies that the contractor is in compliance with this section.\n(2) The department or other contracting agency shall enforce this section pursuant to its existing enforcement powers.\n(3) (A) If a contractor falsely certifies that it is in compliance with this section, the contract with that contractor shall be subject to Article 9 (commencing with Section 10420), unless, within a time period specified by the department or other contracting agency, the contractor provides to the department or agency proof that it has complied, or is in the process of complying, with this section.\n(B) The application of the remedies or penalties contained in Article 9 (commencing with Section 10420) to a contract subject to this chapter shall not preclude the application of any existing remedies otherwise available to the department or other contracting agency under its existing enforcement powers.\n(f) Nothing in this section is intended to regulate the contracting practices of any local jurisdiction.\n(g) This section shall be construed so as not to conflict with applicable federal laws, rules, or regulations. In the event that a court or agency of competent jurisdiction holds that federal law, rule, or regulation invalidates any clause, sentence, paragraph, or section of this code or the application thereof to any person or circumstances, it is the intent of the state that the court or agency sever that clause, sentence, paragraph, or section so that the remainder of this section shall remain in effect.\nSEC. 2.\nSection 10295.35 of the Public Contract Code shall not be construed to create any new enforcement authority or responsibility in the Department of General Services or any other contracting agency.\nSEC. 3.\nNo reimbursement is required by this act pursuant to Section 6 of Article XIII\u2009B of the California Constitution because the only costs that may be incurred by a local agency or school district will be incurred because this act creates a new crime or infraction, eliminates a crime or infraction, or changes the penalty for a crime or infraction, within the meaning of Section 17556 of the Government Code, or changes the definition of a crime within the meaning of Section 6 of Article XIII\u2009B of the California Constitution.', + 'title': 'An act to add Section 10295.35 to the Public Contract Code, relating to public contracts.'} +``` + +El campo `text` es el input y el campo `summary` es el objetivo. 
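Dado que más adelante la guía trunca los inputs a 1024 tokens y los resúmenes a 128, puede ser útil comprobar primero cuán largos son realmente los documentos de BillSum. Una verificación rápida y puramente ilustrativa (no forma parte de la guía original; el conteo de palabras solo aproxima el conteo de tokens):

```py
# Estadística aproximada sobre una muestra del set de train (solo ilustrativo).
from statistics import mean

sample = billsum["train"].select(range(100))
print("promedio de palabras por proyecto de ley:", round(mean(len(ex["text"].split()) for ex in sample)))
print("promedio de palabras por resumen:", round(mean(len(ex["summary"].split()) for ex in sample)))
```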
+ +## Preprocesa + +Carga el tokenizador T5 para procesar `text` y `summary`: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("t5-small") +``` + +La función de preprocesamiento necesita: + +1. Agregar un prefijo al input; una clave para que T5 sepa que se trata de una tarea de generación de resúmenes. Algunos modelos capaces de realizar múltiples tareas de NLP requieren una clave que indique la tarea específica. +2. Usar el argumento `text_target` para tokenizar etiquetas. +3. Truncar secuencias para que no sean más largas que la longitud máxima fijada por el parámetro `max_length`. + +```py +>>> prefix = "summarize: " + + +>>> def preprocess_function(examples): +... inputs = [prefix + doc for doc in examples["text"]] +... model_inputs = tokenizer(inputs, max_length=1024, truncation=True) + +... labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True) + +... model_inputs["labels"] = labels["input_ids"] +... return model_inputs +``` + +Usa la función [`~datasets.Dataset.map`] de 🤗 Datasets para aplicar la función de preprocesamiento sobre el dataset en su totalidad. Puedes acelerar la función `map` configurando el argumento `batched=True` para procesar múltiples elementos del dataset a la vez: + +```py +>>> tokenized_billsum = billsum.map(preprocess_function, batched=True) +``` + +Usa [`DataCollatorForSeq2Seq`] para crear un lote de ejemplos. Esto también *rellenará dinámicamente* tu texto y etiquetas a la dimensión del elemento más largo del lote para que tengan un largo uniforme. Si bien es posible rellenar tu texto en la función `tokenizer` mediante el argumento `padding=True`, el rellenado dinámico es más eficiente. + + + +```py +>>> from transformers import DataCollatorForSeq2Seq + +>>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model) +``` + + +```py +>>> from transformers import DataCollatorForSeq2Seq + +>>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors="tf") +``` + + + +## Entrenamiento + + + +Carga T5 con [`AutoModelForSeq2SeqLM`]: + +```py +>>> from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer + +>>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-small") +``` + + + +Para familiarizarte con el proceso para realizar fine-tuning sobre un modelo con [`Trainer`], ¡mira el tutorial básico [aquí](../training#finetune-with-trainer)! + + + +En este punto, solo faltan tres pasos: + +1. Definir tus hiperparámetros de entrenamiento en [`Seq2SeqTrainingArguments`]. +2. Pasarle los argumentos de entrenamiento a [`Seq2SeqTrainer`] junto con el modelo, dataset y data collator. +3. Llamar [`~Trainer.train`] para realizar el fine-tuning sobre tu modelo. + +```py +>>> training_args = Seq2SeqTrainingArguments( +... output_dir="./results", +... evaluation_strategy="epoch", +... learning_rate=2e-5, +... per_device_train_batch_size=16, +... per_device_eval_batch_size=16, +... weight_decay=0.01, +... save_total_limit=3, +... num_train_epochs=1, +... fp16=True, +... ) + +>>> trainer = Seq2SeqTrainer( +... model=model, +... args=training_args, +... train_dataset=tokenized_billsum["train"], +... eval_dataset=tokenized_billsum["test"], +... tokenizer=tokenizer, +... data_collator=data_collator, +... ) + +>>> trainer.train() +``` + + +Para hacer fine-tuning de un modelo en TensorFlow, comienza por convertir tus datasets al formato `tf.data.Dataset` con [`~datasets.Dataset.to_tf_dataset`]. 
Especifica los inputs y etiquetas en `columns`, el tamaño de lote, el data collator y si es necesario mezclar el dataset:
+
+```py
+>>> tf_train_set = tokenized_billsum["train"].to_tf_dataset(
+...     columns=["attention_mask", "input_ids", "labels"],
+...     shuffle=True,
+...     batch_size=16,
+...     collate_fn=data_collator,
+... )
+
+>>> tf_test_set = tokenized_billsum["test"].to_tf_dataset(
+...     columns=["attention_mask", "input_ids", "labels"],
+...     shuffle=False,
+...     batch_size=16,
+...     collate_fn=data_collator,
+... )
+```
+
+
+
+Para familiarizarte con el fine-tuning con Keras, ¡mira el tutorial básico [aquí](training#finetune-with-keras)!
+
+
+
+Crea la función optimizadora, establece la tasa de aprendizaje y algunos hiperparámetros de entrenamiento:
+
+```py
+>>> from transformers import create_optimizer, AdamWeightDecay
+
+>>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
+```
+
+Carga T5 con [`TFAutoModelForSeq2SeqLM`]:
+
+```py
+>>> from transformers import TFAutoModelForSeq2SeqLM
+
+>>> model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small")
+```
+
+Configura el modelo para entrenamiento con [`compile`](https://keras.io/api/models/model_training_apis/#compile-method):
+
+```py
+>>> model.compile(optimizer=optimizer)
+```
+
+Llama a [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) para realizar el fine-tuning del modelo:
+
+```py
+>>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3)
+```
+
+
+
+
+
+Para un ejemplo con mayor profundidad de cómo hacer fine-tuning a un modelo para generación de resúmenes, revisa la
+[notebook en PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb)
+o la [notebook en TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb).
+
\ No newline at end of file

From 87271d1980effe5e216df8186acd95d94e632389 Mon Sep 17 00:00:00 2001
From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
Date: Mon, 8 Aug 2022 23:48:49 +0200
Subject: [PATCH 087/162] Let's not cast them all (#18471)

* add correct dtypes when checking for params dtype

* forward contrib credits

* Update src/transformers/modeling_utils.py

Co-authored-by: Thomas Wang <24695242+thomasw21@users.noreply.github.com>

* more comments

- added more comments on why we cast only floating point parameters

* Update src/transformers/modeling_utils.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: sgugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Thomas Wang <24695242+thomasw21@users.noreply.github.com>
---
 src/transformers/modeling_utils.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 2a86128c221bec..8bce35f9e336d9 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -543,8 +543,10 @@ def _load_state_dict_into_meta_model(
         param_name = param_name[len(start_prefix) :]
         module_name = param_name
 
-        # We convert floating dtypes to the `dtype` passed.
-        if dtype is not None and not str(param.dtype).startswith("torch.int"):
+
+        # We convert floating dtypes to the `dtype` passed. We want to keep the buffers/params
+        # in int/uint/bool and not cast them.
+ if dtype is not None and torch.is_floating_point(param): param = param.to(dtype) if device_map is None: From a9b29687cd4b0560ab9dbcfc8f7f21b7e5843639 Mon Sep 17 00:00:00 2001 From: Niklas Hansson Date: Tue, 9 Aug 2022 09:35:05 +0200 Subject: [PATCH 088/162] fix: data2vec-vision Onnx ready-made configuration. (#18427) * feat: add the data2vec conf that are missing https://huggingface.co/docs/transformers/serialization * fix: wrong config --- src/transformers/onnx/features.py | 6 ++++++ tests/onnx/test_onnx_v2.py | 1 + 2 files changed, 7 insertions(+) diff --git a/src/transformers/onnx/features.py b/src/transformers/onnx/features.py index e7c24a8ad97a81..3eea94c8c1a64e 100644 --- a/src/transformers/onnx/features.py +++ b/src/transformers/onnx/features.py @@ -229,6 +229,12 @@ class FeaturesManager: "question-answering", onnx_config_cls="models.data2vec.Data2VecTextOnnxConfig", ), + "data2vec-vision": supported_features_mapping( + "default", + "image-classification", + "image-segmentation", + onnx_config_cls="models.data2vec.Data2VecVisionOnnxConfig", + ), "deberta": supported_features_mapping( "default", "masked-lm", diff --git a/tests/onnx/test_onnx_v2.py b/tests/onnx/test_onnx_v2.py index cfc58dd335c30d..c22406841afdae 100644 --- a/tests/onnx/test_onnx_v2.py +++ b/tests/onnx/test_onnx_v2.py @@ -207,6 +207,7 @@ def test_values_override(self): ("deit", "facebook/deit-small-patch16-224"), ("beit", "microsoft/beit-base-patch16-224"), ("data2vec-text", "facebook/data2vec-text-base"), + ("data2vec-vision", "facebook/data2vec-vision-base"), ("perceiver", "deepmind/language-perceiver", ("masked-lm", "sequence-classification")), ("perceiver", "deepmind/vision-perceiver-conv", ("image-classification",)), ("yolos", "hustvl/yolos-tiny"), From 24f688fcd4f3bc5278c7c1bb67a87533c1ea3656 Mon Sep 17 00:00:00 2001 From: Thomas Chaigneau Date: Tue, 9 Aug 2022 09:46:53 +0200 Subject: [PATCH 089/162] Add mt5 onnx config (#18394) * update features * MT5OnnxConfig added with updated with tests and docs * fix imports * fix onnc_config_cls for mt5 Co-authored-by: Thomas Chaigneau --- docs/source/en/serialization.mdx | 1 + src/transformers/models/mt5/__init__.py | 4 +-- .../models/mt5/configuration_mt5.py | 28 +++++++++++++++++++ src/transformers/onnx/features.py | 7 +++++ tests/onnx/test_onnx_v2.py | 1 + 5 files changed, 39 insertions(+), 2 deletions(-) diff --git a/docs/source/en/serialization.mdx b/docs/source/en/serialization.mdx index e41ccae949e8bb..9561bbd8ec77c1 100644 --- a/docs/source/en/serialization.mdx +++ b/docs/source/en/serialization.mdx @@ -79,6 +79,7 @@ Ready-made configurations include the following architectures: - mBART - MobileBERT - MobileViT +- MT5 - OpenAI GPT-2 - Perceiver - PLBart diff --git a/src/transformers/models/mt5/__init__.py b/src/transformers/models/mt5/__init__.py index 3f04a256918bc3..f6e717bd875b52 100644 --- a/src/transformers/models/mt5/__init__.py +++ b/src/transformers/models/mt5/__init__.py @@ -43,7 +43,7 @@ MT5TokenizerFast = T5TokenizerFast -_import_structure = {"configuration_mt5": ["MT5Config"]} +_import_structure = {"configuration_mt5": ["MT5Config", "MT5OnnxConfig"]} try: if not is_torch_available(): @@ -71,7 +71,7 @@ if TYPE_CHECKING: - from .configuration_mt5 import MT5Config + from .configuration_mt5 import MT5Config, MT5OnnxConfig try: if not is_torch_available(): diff --git a/src/transformers/models/mt5/configuration_mt5.py b/src/transformers/models/mt5/configuration_mt5.py index ad0345f53189e9..3e72831ad25fbc 100644 --- 
a/src/transformers/models/mt5/configuration_mt5.py +++ b/src/transformers/models/mt5/configuration_mt5.py @@ -13,8 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. """ mT5 model configuration""" +from typing import Mapping from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxSeq2SeqConfigWithPast from ...utils import logging @@ -143,3 +145,29 @@ def num_attention_heads(self): @property def num_hidden_layers(self): return self.num_layers + + +# Copied from transformers.models.t5.configuration_t5.T5OnnxConfig +class MT5OnnxConfig(OnnxSeq2SeqConfigWithPast): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + common_inputs = { + "input_ids": {0: "batch", 1: "encoder_sequence"}, + "attention_mask": {0: "batch", 1: "encoder_sequence"}, + } + if self.use_past: + common_inputs["attention_mask"][1] = "past_encoder_sequence + sequence" + common_inputs["decoder_input_ids"] = {0: "batch"} + common_inputs["decoder_attention_mask"] = {0: "batch", 1: "past_decoder_sequence + sequence"} + else: + common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"} + common_inputs["decoder_attention_mask"] = {0: "batch", 1: "decoder_sequence"} + + if self.use_past: + self.fill_with_past_key_values_(common_inputs, direction="inputs") + + return common_inputs + + @property + def default_onnx_opset(self) -> int: + return 13 diff --git a/src/transformers/onnx/features.py b/src/transformers/onnx/features.py index 3eea94c8c1a64e..8d8b8190e46819 100644 --- a/src/transformers/onnx/features.py +++ b/src/transformers/onnx/features.py @@ -383,6 +383,13 @@ class FeaturesManager: "image-classification", onnx_config_cls="models.mobilevit.MobileViTOnnxConfig", ), + "mt5": supported_features_mapping( + "default", + "default-with-past", + "seq2seq-lm", + "seq2seq-lm-with-past", + onnx_config_cls="models.mt5.MT5OnnxConfig", + ), "m2m-100": supported_features_mapping( "default", "default-with-past", diff --git a/tests/onnx/test_onnx_v2.py b/tests/onnx/test_onnx_v2.py index c22406841afdae..98ab0fad131e01 100644 --- a/tests/onnx/test_onnx_v2.py +++ b/tests/onnx/test_onnx_v2.py @@ -224,6 +224,7 @@ def test_values_override(self): ("mbart", "sshleifer/tiny-mbart"), ("t5", "t5-small"), ("marian", "Helsinki-NLP/opus-mt-en-de"), + ("mt5", "google/mt5-base"), ("m2m-100", "facebook/m2m100_418M"), ("blenderbot-small", "facebook/blenderbot_small-90M"), ("blenderbot", "facebook/blenderbot-400M-distill"), From c4654370dc1591bc9ccfeee92be308aba19879d3 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Tue, 9 Aug 2022 14:33:41 +0200 Subject: [PATCH 090/162] Minor update of `run_call_with_unpacked_inputs` (#18541) Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: ydshieh --- src/transformers/modeling_tf_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index 703440d80ad71b..68ee4117a2f9db 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -398,7 +398,7 @@ def run_call_with_unpacked_inputs(self, *args, **kwargs): fn_args_and_kwargs.update(dict(zip(func.__code__.co_varnames[1:], args))) # Encoder Decoder models delegate the application of the configuration options to their inner models. 
- if "encoder_decoder" in str(self).lower(): + if "EncoderDecoder" in self.__class__.__name__: config = None else: config = self.config From cafb76e4c0fffe57d8e20b24ee69ebfc6640109f Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Tue, 9 Aug 2022 14:47:18 +0200 Subject: [PATCH 091/162] BART - Fix attention mask device issue on copied models (#18540) * attempt to fix attn mask device * fix bart `_prepare_decoder_attention_mask` - add correct device - run `make fix-copies` to propagate the fix --- src/transformers/models/bart/modeling_bart.py | 4 +++- .../models/bigbird_pegasus/modeling_bigbird_pegasus.py | 4 +++- src/transformers/models/blenderbot/modeling_blenderbot.py | 4 +++- .../models/blenderbot_small/modeling_blenderbot_small.py | 4 +++- src/transformers/models/marian/modeling_marian.py | 4 +++- src/transformers/models/mbart/modeling_mbart.py | 4 +++- src/transformers/models/opt/modeling_opt.py | 4 +++- src/transformers/models/pegasus/modeling_pegasus.py | 4 +++- src/transformers/models/plbart/modeling_plbart.py | 4 +++- 9 files changed, 27 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py index 011eee1f24b54b..8411cc6cefefed 100755 --- a/src/transformers/models/bart/modeling_bart.py +++ b/src/transformers/models/bart/modeling_bart.py @@ -915,7 +915,9 @@ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_em if attention_mask is not None: # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( + inputs_embeds.device + ) combined_attention_mask = ( expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask ) diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index ce5040e92c7fa3..3cdfe7d2ffe097 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -2116,7 +2116,9 @@ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_em if attention_mask is not None: # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( + inputs_embeds.device + ) combined_attention_mask = ( expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask ) diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py index 2a53099d9c4ce5..303a5c4f256997 100755 --- a/src/transformers/models/blenderbot/modeling_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_blenderbot.py @@ -854,7 +854,9 @@ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_em if attention_mask is not None: # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( + inputs_embeds.device 
+ ) combined_attention_mask = ( expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask ) diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py index e5b717ef9c181d..8dac9b6a7573c2 100755 --- a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py @@ -850,7 +850,9 @@ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_em if attention_mask is not None: # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( + inputs_embeds.device + ) combined_attention_mask = ( expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask ) diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py index 32e59098ef11dc..26dc6b12dc9fe6 100755 --- a/src/transformers/models/marian/modeling_marian.py +++ b/src/transformers/models/marian/modeling_marian.py @@ -860,7 +860,9 @@ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_em if attention_mask is not None: # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( + inputs_embeds.device + ) combined_attention_mask = ( expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask ) diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py index b9057178a03227..16ea95bc0aedde 100755 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -913,7 +913,9 @@ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_em if attention_mask is not None: # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( + inputs_embeds.device + ) combined_attention_mask = ( expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask ) diff --git a/src/transformers/models/opt/modeling_opt.py b/src/transformers/models/opt/modeling_opt.py index 8a1c021c845052..419c2391e4c708 100644 --- a/src/transformers/models/opt/modeling_opt.py +++ b/src/transformers/models/opt/modeling_opt.py @@ -534,7 +534,9 @@ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_em if attention_mask is not None: # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( + inputs_embeds.device + ) combined_attention_mask = ( expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask ) diff --git a/src/transformers/models/pegasus/modeling_pegasus.py 
b/src/transformers/models/pegasus/modeling_pegasus.py index 9e797af035cf0a..5a144aa3e9c514 100755 --- a/src/transformers/models/pegasus/modeling_pegasus.py +++ b/src/transformers/models/pegasus/modeling_pegasus.py @@ -880,7 +880,9 @@ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_em if attention_mask is not None: # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( + inputs_embeds.device + ) combined_attention_mask = ( expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask ) diff --git a/src/transformers/models/plbart/modeling_plbart.py b/src/transformers/models/plbart/modeling_plbart.py index eb8b5d2b41671c..d03ddf33ebfa7a 100755 --- a/src/transformers/models/plbart/modeling_plbart.py +++ b/src/transformers/models/plbart/modeling_plbart.py @@ -887,7 +887,9 @@ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_em if attention_mask is not None: # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( + inputs_embeds.device + ) combined_attention_mask = ( expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask ) From a25b1b3c169ab31d538d781f877dcc78ba365721 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Tue, 9 Aug 2022 18:50:02 +0200 Subject: [PATCH 092/162] Adding a new `align_to_words` param to qa pipeline. (#18010) * Adding a new `align_to_words` param to qa pipeline. * Update src/transformers/pipelines/question_answering.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Import protection. 
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- .../pipelines/question_answering.py | 48 +++++++++++++++---- .../test_pipelines_question_answering.py | 23 +++++++++ 2 files changed, 61 insertions(+), 10 deletions(-) diff --git a/src/transformers/pipelines/question_answering.py b/src/transformers/pipelines/question_answering.py index d58762035ef7f8..6f07382dc57c6b 100644 --- a/src/transformers/pipelines/question_answering.py +++ b/src/transformers/pipelines/question_answering.py @@ -8,7 +8,14 @@ from ..data import SquadExample, SquadFeatures, squad_convert_examples_to_features from ..modelcard import ModelCard from ..tokenization_utils import PreTrainedTokenizer -from ..utils import PaddingStrategy, add_end_docstrings, is_tf_available, is_torch_available, logging +from ..utils import ( + PaddingStrategy, + add_end_docstrings, + is_tf_available, + is_tokenizers_available, + is_torch_available, + logging, +) from .base import PIPELINE_INIT_ARGS, ArgumentHandler, ChunkPipeline @@ -18,6 +25,9 @@ from ..modeling_tf_utils import TFPreTrainedModel from ..modeling_utils import PreTrainedModel + if is_tokenizers_available(): + import tokenizers + if is_tf_available(): import tensorflow as tf @@ -180,6 +190,7 @@ def _sanitize_parameters( max_seq_len=None, max_question_len=None, handle_impossible_answer=None, + align_to_words=None, **kwargs ): # Set defaults values @@ -208,6 +219,8 @@ def _sanitize_parameters( postprocess_params["max_answer_len"] = max_answer_len if handle_impossible_answer is not None: postprocess_params["handle_impossible_answer"] = handle_impossible_answer + if align_to_words is not None: + postprocess_params["align_to_words"] = align_to_words return preprocess_params, {}, postprocess_params def __call__(self, *args, **kwargs): @@ -243,6 +256,9 @@ def __call__(self, *args, **kwargs): The maximum length of the question after tokenization. It will be truncated if needed. handle_impossible_answer (`bool`, *optional*, defaults to `False`): Whether or not we accept impossible as an answer. + align_to_words (`bool`, *optional*, defaults to `True`): + Attempts to align the answer to real words. Improves quality on space separated langages. Might hurt on + non-space-separated languages (like Japanese or Chinese) Return: A `dict` or a list of `dict`: Each result comes as a dictionary with the following keys: @@ -386,6 +402,7 @@ def postprocess( top_k=1, handle_impossible_answer=False, max_answer_len=15, + align_to_words=True, ): min_null_score = 1000000 # large and positive answers = [] @@ -464,15 +481,8 @@ def postprocess( for s, e, score in zip(starts, ends, scores): s = s - offset e = e - offset - try: - start_word = enc.token_to_word(s) - end_word = enc.token_to_word(e) - start_index = enc.word_to_chars(start_word, sequence_index=sequence_index)[0] - end_index = enc.word_to_chars(end_word, sequence_index=sequence_index)[1] - except Exception: - # Some tokenizers don't really handle words. Keep to offsets then. 
- start_index = enc.offsets[s][0] - end_index = enc.offsets[e][1] + + start_index, end_index = self.get_indices(enc, s, e, sequence_index, align_to_words) answers.append( { @@ -490,6 +500,24 @@ def postprocess( return answers[0] return answers + def get_indices( + self, enc: "tokenizers.Encoding", s: int, e: int, sequence_index: int, align_to_words: bool + ) -> Tuple[int, int]: + if align_to_words: + try: + start_word = enc.token_to_word(s) + end_word = enc.token_to_word(e) + start_index = enc.word_to_chars(start_word, sequence_index=sequence_index)[0] + end_index = enc.word_to_chars(end_word, sequence_index=sequence_index)[1] + except Exception: + # Some tokenizers don't really handle words. Keep to offsets then. + start_index = enc.offsets[s][0] + end_index = enc.offsets[e][1] + else: + start_index = enc.offsets[s][0] + end_index = enc.offsets[e][1] + return start_index, end_index + def decode( self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int, undesired_tokens: np.ndarray ) -> Tuple: diff --git a/tests/pipelines/test_pipelines_question_answering.py b/tests/pipelines/test_pipelines_question_answering.py index c3a0da2f2b5e9a..001254aa94b01e 100644 --- a/tests/pipelines/test_pipelines_question_answering.py +++ b/tests/pipelines/test_pipelines_question_answering.py @@ -171,6 +171,29 @@ def ensure_large_logits_postprocess( self.assertEqual(nested_simplify(outputs), {"score": 0.028, "start": 0, "end": 11, "answer": "HuggingFace"}) + @slow + @require_torch + def test_small_model_japanese(self): + question_answerer = pipeline( + "question-answering", + model="KoichiYasuoka/deberta-base-japanese-aozora-ud-head", + ) + output = question_answerer(question="国語", context="全学年にわたって小学校の国語の教科書に挿し絵が用いられている") + + # Wrong answer, the whole text is identified as one "word" since the tokenizer does not include + # a pretokenizer + self.assertEqual( + nested_simplify(output), + {"score": 1.0, "start": 0, "end": 30, "answer": "全学年にわたって小学校の国語の教科書に挿し絵が用いられている"}, + ) + + # Disable word alignment + output = question_answerer(question="国語", context="全学年にわたって小学校の国語の教科書に挿し絵が用いられている", align_to_words=False) + self.assertEqual( + nested_simplify(output), + {"score": 1.0, "start": 15, "end": 18, "answer": "教科書"}, + ) + @slow @require_torch def test_small_model_long_context_cls_slow(self): From fdd9c95c0bd90cce0d4ba3099e4bb597ad9a76cd Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Tue, 9 Aug 2022 09:58:11 -0700 Subject: [PATCH 093/162] =?UTF-8?q?=20=F0=9F=93=9D=20update=20metric=20wit?= =?UTF-8?q?h=20evaluate=20(#18535)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/source/en/training.mdx | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/docs/source/en/training.mdx b/docs/source/en/training.mdx index 6b0ec8a4081050..9222d27ac81f6e 100644 --- a/docs/source/en/training.mdx +++ b/docs/source/en/training.mdx @@ -98,18 +98,18 @@ Specify where to save the checkpoints from your training: >>> training_args = TrainingArguments(output_dir="test_trainer") ``` -### Metrics +### Evaluate -[`Trainer`] does not automatically evaluate model performance during training. You will need to pass [`Trainer`] a function to compute and report metrics. 
The 🤗 Datasets library provides a simple [`accuracy`](https://huggingface.co/metrics/accuracy) function you can load with the `load_metric` (see this [tutorial](https://huggingface.co/docs/datasets/metrics.html) for more information) function: +[`Trainer`] does not automatically evaluate model performance during training. You'll need to pass [`Trainer`] a function to compute and report metrics. The [🤗 Evaluate](https://huggingface.co/docs/evaluate/index) library provides a simple [`accuracy`](https://huggingface.co/spaces/evaluate-metric/accuracy) function you can load with the [`evaluate.load`] (see this [quicktour](https://huggingface.co/docs/evaluate/a_quick_tour) for more information) function: ```py >>> import numpy as np ->>> from datasets import load_metric +>>> import evaluate ->>> metric = load_metric("accuracy") +>>> metric = evaluate.load("accuracy") ``` -Call `compute` on `metric` to calculate the accuracy of your predictions. Before passing your predictions to `compute`, you need to convert the predictions to logits (remember all 🤗 Transformers models return logits): +Call [`~evaluate.compute`] on `metric` to calculate the accuracy of your predictions. Before passing your predictions to `compute`, you need to convert the predictions to logits (remember all 🤗 Transformers models return logits): ```py >>> def compute_metrics(eval_pred): @@ -341,12 +341,14 @@ To keep track of your training progress, use the [tqdm](https://tqdm.github.io/) ... progress_bar.update(1) ``` -### Metrics +### Evaluate -Just like how you need to add an evaluation function to [`Trainer`], you need to do the same when you write your own training loop. But instead of calculating and reporting the metric at the end of each epoch, this time you will accumulate all the batches with [`add_batch`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=add_batch#datasets.Metric.add_batch) and calculate the metric at the very end. +Just like how you added an evaluation function to [`Trainer`], you need to do the same when you write your own training loop. But instead of calculating and reporting the metric at the end of each epoch, this time you'll accumulate all the batches with [`~evaluate.add_batch`] and calculate the metric at the very end. ```py ->>> metric = load_metric("accuracy") +>>> import evaluate + +>>> metric = evaluate.load("accuracy") >>> model.eval() >>> for batch in eval_dataloader: ... batch = {k: v.to(device) for k, v in batch.items()} From 5e8a3d44a1b323d7f885871ce2f5c5e392cc39c7 Mon Sep 17 00:00:00 2001 From: YouJiacheng <1503679330@qq.com> Date: Wed, 10 Aug 2022 02:23:30 +0800 Subject: [PATCH 094/162] Restore _init_weights value in no_init_weights (#18504) * Recover _init_weights value in no_init_weights For potential nested use. In addition, users might modify private no_init_weights as well. * Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Remove private variable change check Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/modeling_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 8bce35f9e336d9..78c012ec095fdb 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -106,12 +106,13 @@ def no_init_weights(_enable=True): TODO(Patrick): Delete safety argument `_enable=True` at next major version. . 
""" global _init_weights + old_init_weights = _init_weights if _enable: _init_weights = False try: yield finally: - _init_weights = True + _init_weights = old_init_weights try: From ba982711b1af05f071ca4dd796e845ba19b80fe0 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger Date: Tue, 9 Aug 2022 15:15:01 -0400 Subject: [PATCH 095/162] Clean up comment --- src/transformers/utils/hub.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/utils/hub.py b/src/transformers/utils/hub.py index 570ff52e707532..07164e735db901 100644 --- a/src/transformers/utils/hub.py +++ b/src/transformers/utils/hub.py @@ -952,7 +952,6 @@ def move_to_new_cache(file, repo, filename, revision, etag, commit_hash): # blobs os.makedirs(os.path.join(repo, "blobs"), exist_ok=True) - # TODO: replace copy by move when all works well. blob_path = os.path.join(repo, "blobs", etag) shutil.move(file, blob_path) From 3a70590f3d9add5223dadad93c33bc629d838078 Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Tue, 9 Aug 2022 16:22:55 -0700 Subject: [PATCH 096/162] =?UTF-8?q?=20=F0=9F=93=9D=20update=20documentatio?= =?UTF-8?q?n=20build=20section=20(#18548)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/source/en/pr_checks.mdx | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/docs/source/en/pr_checks.mdx b/docs/source/en/pr_checks.mdx index 57e0766c7f6776..8b562b62b29c57 100644 --- a/docs/source/en/pr_checks.mdx +++ b/docs/source/en/pr_checks.mdx @@ -65,13 +65,9 @@ Just in case anything slipped through the cracks, the full test suite is also ru ## Documentation build -The job `ci/circleci: build_doc` runs a build of the documentation just to make sure everything will be okay once your PR is merged. If that steps fails, you can inspect it locally by going into the `docs` folder of the Transformers repo and then typing +The `build_pr_documentation` job builds and generates a preview of the documentation to make sure everything looks okay once your PR is merged. A bot will add a link to preview the documentation in your PR. Any changes you make to the PR are automatically updated in the preview. If the documentation fails to build, click on **Details** next to the failed job to see where things went wrong. Often, the error is as simple as a missing file in the `toctree`. -```bash -make html -``` - -Sphinx is not known for its helpful error messages, so you might have to try a few things to really find the source of the error. +If you're interested in building or previewing the documentation locally, take a look at the [`README.md`](https://github.com/huggingface/transformers/tree/main/docs) in the docs folder. ## Code and documentation style From 09f36ba1855463d78a0c2f00af56520a2a545863 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Wed, 10 Aug 2022 09:13:36 +0200 Subject: [PATCH 097/162] `bitsandbytes` - `Linear8bitLt` integration into `transformers` models (#17901) * first commit * correct replace function * add final changes - works like charm! 
- cannot implement tests yet - tested * clean up a bit * add bitsandbytes dependencies * working version - added import function - added bitsandbytes utils file * small fix * small fix - fix import issue * fix import issues * Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * refactor a bit - move bitsandbytes utils to utils - change comments on functions * reformat docstring - reformat docstring on init_empty_weights_8bit * Update src/transformers/__init__.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * revert bad formatting * change to bitsandbytes * refactor a bit - remove init8bit since it is useless * more refactoring - fixed init empty weights issue - added threshold param * small hack to make it work * Update src/transformers/modeling_utils.py * Update src/transformers/modeling_utils.py * revmoe the small hack * modify utils file * make style + refactor a bit * create correctly device map * add correct dtype for device map creation * Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * apply suggestions - remove with torch.grad - do not rely on Python bool magic! * add docstring - add docstring for new kwargs * add docstring - comment `replace_8bit_linear` function - fix weird formatting * - added more documentation - added new utility function for memory footprint tracking - colab demo to add * few modifs - typo doc - force cast into float16 when load_in_8bit is enabled * added colab link * add test architecture + docstring a bit * refactor a bit testing class * make style + refactor a bit * enhance checks - add more checks - start writing saving test * clean up a bit * male style * add more details on doc * add more tests - still needs to fix 2 tests * replace by "or" - could not fix it from GitHub GUI Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * refactor a bit testing code + add readme * make style * fix import issue * Update src/transformers/modeling_utils.py Co-authored-by: Michael Benayoun * add few comments * add more doctring + make style * more docstring * raise error when loaded in 8bit * make style * add warning if loaded on CPU * add small sanity check * fix small comment * add bitsandbytes on dockerfile * Improve documentation - improve documentation from comments * add few comments * slow tests pass on the VM but not on the CI VM * Fix merge conflict * make style * another test should pass on a multi gpu setup * fix bad import in testing file * Fix slow tests - remove dummy batches - no more CUDA illegal memory errors * odify dockerfile * Update docs/source/en/main_classes/model.mdx * Update Dockerfile * Update model.mdx * Update Dockerfile * Apply suggestions from code review * few modifications - lm head can stay on disk/cpu - change model name so that test pass * change test value - change test value to the correct output - torch bmm changed to baddmm in bloom modeling when merging * modify installation guidelines * Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * replace `n`by `name` * merge `load_in_8bit` and `low_cpu_mem_usage` * first try - keep the lm head in full precision * better check - check 
the attribute `base_model_prefix` instead of computing the number of parameters

* added more tests

* Update src/transformers/utils/bitsandbytes.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Merge branch 'integration-8bit' of https://github.com/younesbelkada/transformers into integration-8bit

* improve documentation

- fix typos for installation
- change title in the documentation

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Michael Benayoun
---
 docker/transformers-all-latest-gpu/Dockerfile |   3 +
 docs/source/en/main_classes/model.mdx         |  41 +++-
 src/transformers/__init__.py                  |   1 +
 src/transformers/modeling_utils.py            | 102 ++++++++-
 src/transformers/utils/bitsandbytes.py        | 142 ++++++++++++
 tests/mixed_int8/README.md                    |  37 +++
 tests/mixed_int8/__init__.py                  |   0
 tests/mixed_int8/test_mixed_int8.py           | 215 ++++++++++++++++++
 utils/tests_fetcher.py                        |   1 +
 9 files changed, 534 insertions(+), 8 deletions(-)
 create mode 100644 src/transformers/utils/bitsandbytes.py
 create mode 100644 tests/mixed_int8/README.md
 create mode 100644 tests/mixed_int8/__init__.py
 create mode 100644 tests/mixed_int8/test_mixed_int8.py

diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile
index e97a91f4246fb4..b0a55ba8be946b 100644
--- a/docker/transformers-all-latest-gpu/Dockerfile
+++ b/docker/transformers-all-latest-gpu/Dockerfile
@@ -45,6 +45,9 @@ RUN python3 -m pip install -U "itsdangerous<2.1.0"
 
 RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
 
+# Add bitsandbytes for mixed int8 testing
+RUN python3 -m pip install -i https://test.pypi.org/simple/ bitsandbytes==0.31.5
+
 RUN python3 -m pip install --no-cache-dir decord
 
 # When installing in editable mode, `transformers` is not recognized as a package.
diff --git a/docs/source/en/main_classes/model.mdx b/docs/source/en/main_classes/model.mdx
index c59af2d2214814..10f81e55d74506 100644
--- a/docs/source/en/main_classes/model.mdx
+++ b/docs/source/en/main_classes/model.mdx
@@ -105,7 +105,7 @@ You can also write your own device map following the same format (a dictionary l
 device_map = {"shared": 0, "encoder": 0, "decoder": 1, "lm_head": 1}
 ```
 
-Another way to minimize the memory impact of your model is to instantiate it at a lower precision dtype (like `torch.float16`).
+Another way to minimize the memory impact of your model is to instantiate it at a lower precision dtype (like `torch.float16`) or use direct quantization techniques as described below.
 
 ### Model Instantiation dtype
 
@@ -133,6 +133,45 @@ model = AutoModel.from_config(config)
 
 Due to Pytorch design, this functionality is only available for floating dtypes.
 
+### `bitsandbytes` integration for Int8 mixed-precision matrix decomposition
+
+From the paper `LLM.int8(): 8-bit Matrix Multiplication for Transformers at Scale`, we support Hugging Face 🤗 integration for all models in the Hub with a few lines of code, for models trained in half precision (either `float16` or `bfloat16`) or in full precision. This method aims to reduce the size of `nn.Linear` layers by a factor of 2 (if trained in half precision) or 4 (if trained in full precision) without hurting quality too much, by handling the outliers in half precision.
+This technique is useful and works well for billion-scale models (>1B parameters), so we advise you to use it only for models at that scale. This method has been tested for 2-billion to 176-billion scale models and supports only PyTorch models.
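A rough way to check the savings described above is to compare footprints directly, for instance with the `get_memory_footprint` helper this same patch adds to `modeling_utils.py`. This is a sketch only: it assumes a CUDA GPU with `bitsandbytes` and `accelerate` installed and enough memory for both copies, and the model name is merely an example.

```py
# Illustrative fp16 vs. mixed-int8 footprint comparison (requires a CUDA GPU).
import torch
from transformers import AutoModelForCausalLM

name = "bigscience/bloom-2b5"  # example checkpoint

model_fp16 = AutoModelForCausalLM.from_pretrained(name, device_map="auto", torch_dtype=torch.float16)
model_int8 = AutoModelForCausalLM.from_pretrained(name, device_map="auto", load_in_8bit=True)

# `get_memory_footprint` reports the model size in bytes; the int8 model
# should be noticeably smaller than the fp16 one.
print(model_fp16.get_memory_footprint())
print(model_int8.get_memory_footprint())
```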
+This method has been tested on models ranging from 2 billion to 176 billion parameters and supports only PyTorch models. + +![HFxbitsandbytes.png](https://s3.amazonaws.com/moonup/production/uploads/1659861207959-62441d1d9fdefb55a0b7d12c.png) + +Int8 mixed-precision matrix decomposition works by separating a matrix multiplication into two streams: (1) a systematic feature outlier stream multiplied in fp16 (0.01% of values), and (2) a regular stream of int8 matrix multiplication (99.9% of values). With this method, int8 inference with no predictive degradation is possible for very large models (>=176B parameters). +Values are usually normally distributed, that is, most values are in the range [-3.5, 3.5], but there are some exceptional systematic outliers that are very differently distributed for large models. These outliers are often in the interval [-60, -6] or [6, 60]. Int8 quantization works well for values of magnitude ~5, but beyond that, there is a significant performance penalty. A good default threshold is 6, but a lower threshold might be needed for more unstable models (small models, fine-tuning). + +Note also that you need a GPU to run mixed-8bit models, as the kernels have been compiled for GPUs only. Make sure that you have enough GPU RAM to store a quarter of the model (or half, if your model weights are natively in half precision) before using this feature. + +Below are some notes to help you use this module, or follow this demo on Google Colab: [![Open In Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1qOjXfQIAULfKvZqwCen8-MoWKGdSatZ4?usp=sharing) + +#### Requirements + +- Make sure you run this on an NVIDIA GPU that supports 8-bit tensor cores (Turing or Ampere GPUs, e.g. T4, RTX 20 series, RTX 30 series, A40-A100). Note that previous generations of NVIDIA GPUs do not support 8-bit tensor cores. +- Install the correct version of `bitsandbytes` by running: +`pip install -i https://test.pypi.org/simple/ bitsandbytes` +- Install `accelerate`: +`pip install accelerate` + +#### Running mixed-int8 models + +After carefully installing the required libraries, the way to load your mixed 8-bit model is as follows: +```py +model_name = "bigscience/bloom-2b5" +model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True) +``` +The implementation supports multi-GPU setups thanks to `accelerate` as the backend.
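+You can then use the quantized model like any other model. The snippet below is an illustrative sketch, not part of the official example above: it assumes the same `bigscience/bloom-2b5` checkpoint and relies only on `generate` and the `get_memory_footprint` helper introduced in this PR: +```py +from transformers import AutoModelForCausalLM, AutoTokenizer + +model_name = "bigscience/bloom-2b5" +tokenizer = AutoTokenizer.from_pretrained(model_name) +model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True) + +# The 8-bit variant should need roughly half the memory of a float16 checkpoint +print(f"Memory footprint: {model_8bit.get_memory_footprint() / 1e9:.2f} GB") + +# Generation works as usual; we move the inputs to the first GPU +inputs = tokenizer("Hello my name is", return_tensors="pt").to(0) +outputs = model_8bit.generate(**inputs, max_new_tokens=10) +print(tokenizer.decode(outputs[0], skip_special_tokens=True)) +```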
+If you want to control how much GPU memory to allocate on each GPU, you can use the `max_memory` argument as follows +(for example, to allocate `1GB` to GPU 0 and `2GB` to GPU 1, you can use `max_memory={0: "1GB", 1: "2GB"}`): +```py +max_memory_mapping = {0: "1GB", 1: "2GB"} +model_name = "bigscience/bloom-3b" +model_8bit = AutoModelForCausalLM.from_pretrained( + model_name, device_map="auto", load_in_8bit=True, max_memory=max_memory_mapping +) +``` ## ModuleUtilsMixin diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index d56c70c024e6ed..28ec6a17b3a323 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -462,6 +462,7 @@ "is_vision_available", "logging", ], + "utils.bitsandbytes": [], } # sentencepiece-backed objects diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 78c012ec095fdb..1d895baecfedac 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -61,6 +61,7 @@ copy_func, has_file, is_accelerate_available, + is_bitsandbytes_available, is_offline_mode, logging, replace_return_docstrings, @@ -83,6 +84,9 @@ else: get_balanced_memory = None +if is_bitsandbytes_available(): + from .utils.bitsandbytes import get_key_to_not_convert, replace_8bit_linear, set_module_8bit_tensor_to_device + logger = logging.get_logger(__name__) @@ -501,6 +505,7 @@ def _load_state_dict_into_meta_model( state_dict_folder=None, state_dict_index=None, dtype=None, + load_in_8bit=False, ): """ This is somewhat similar to `_load_state_dict_into_model`, but deals with a model that has some or all of its @@ -561,13 +566,14 @@ def _load_state_dict_into_meta_model( # TODO: group all errors and raise at the end. raise ValueError(f"{param_name} doesn't have any device set.") param_device = device_map[module_name] - if param_device == "disk": offload_index = offload_weight(param, param_name, offload_folder, offload_index) elif param_device == "cpu" and state_dict_index is not None: state_dict_index = offload_weight(param, param_name, state_dict_folder, state_dict_index) - else: + elif not load_in_8bit: set_module_tensor_to_device(model, param_name, param_device, value=param) + else: + set_module_8bit_tensor_to_device(model, param_name, param_device, value=param) return error_msgs, offload_index, state_dict_index @@ -1578,6 +1584,24 @@ def save_pretrained( save_directory, repo_id, files_timestamps, commit_message=commit_message, token=token ) + def get_memory_footprint(self, return_buffers=True): + r""" + Get the memory footprint of the model. This will return the memory footprint of the current model in bytes. + Useful to benchmark the memory footprint of the current model and to design some tests. Solution inspired by this + PyTorch discussion: https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822/2 + + Arguments: + return_buffers (`bool`, *optional*, defaults to `True`): + Whether to include the size of the buffer tensors in the computation of the memory footprint. Buffers + are tensors that do not require gradients and are not registered as parameters, e.g. the mean and std in batch + norm layers.
Please see: https://discuss.pytorch.org/t/what-pytorch-means-by-buffers/120266/2 + """ + mem = sum([param.nelement() * param.element_size() for param in self.parameters()]) + if return_buffers: + mem_bufs = sum([buf.nelement() * buf.element_size() for buf in self.buffers()]) + mem = mem + mem_bufs + return mem + @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs): r""" @@ -1707,6 +1731,22 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P If `True`, will temporarily offload the CPU state dict to the hard drive to avoid getting out of CPU RAM if the weight of the CPU state dict + the biggest shard of the checkpoint does not fit. Defaults to `True` when there is some disk offload. + load_in_8bit (`bool`, *optional*, defaults to `False`): + If `True`, will convert the loaded model into a mixed-8bit quantized model. To use this feature please + install `bitsandbytes` compiled with your CUDA version by running `pip install -i + https://test.pypi.org/simple/ bitsandbytes-cudaXXX` where XXX is your CUDA version (e.g. 11.6 = 116). + Also make sure that you have enough GPU RAM to store half of the model size, since the 8-bit modules are + not compiled or adapted for CPUs. + int8_threshold (`float`, *optional*, defaults to 6): + Works together with `load_in_8bit`. This corresponds to the outlier threshold for outlier detection as + described in the `LLM.int8(): 8-bit Matrix Multiplication for Transformers at Scale` paper. Any hidden + state value above this threshold will be considered an outlier and the operation on those + values will be done in fp16. Values are usually normally distributed, that is, most values are in the + range [-3.5, 3.5], but there are some exceptional systematic outliers that are very differently + distributed for large models. These outliers are often in the interval [-60, -6] or [6, 60]. Int8 + quantization works well for values of magnitude ~5, but beyond that, there is a significant performance + penalty. A good default threshold is 6, but a lower threshold might be needed for more unstable models + (small models, fine-tuning). subfolder (`str`, *optional*, defaults to `""`): In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can specify the folder name here. @@ -1796,7 +1836,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P device_map = kwargs.pop("device_map", None) max_memory = kwargs.pop("max_memory", None) offload_folder = kwargs.pop("offload_folder", None) - offload_state_dict = kwargs.pop("offload_state_dict", None) + offload_state_dict = kwargs.pop("offload_state_dict", False) + load_in_8bit = kwargs.pop("load_in_8bit", False) + int8_threshold = kwargs.pop("int8_threshold", 6.0) subfolder = kwargs.pop("subfolder", "") if trust_remote_code is True: @@ -1804,7 +1846,6 @@ "The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is" " ignored."
) - if device_map is not None: if low_cpu_mem_usage is None: low_cpu_mem_usage = True @@ -1824,6 +1865,28 @@ "Using `low_cpu_mem_usage=True` or a `device_map` requires Accelerate: `pip install accelerate`" ) + if load_in_8bit: + if not (is_accelerate_available() and is_bitsandbytes_available()): + raise ImportError( + "Using `load_in_8bit=True` requires Accelerate: `pip install accelerate` and the latest version of" + " `bitsandbytes`: `pip install -i https://test.pypi.org/simple/ bitsandbytes` or" + " `pip install bitsandbytes`" + ) + if torch_dtype == "auto" or torch_dtype != torch.float16: + # We force the `dtype` to be float16, this is a requirement from `bitsandbytes` + torch_dtype = torch.float16 + logger.info("Loading the model in mixed int8 - forcing the weights to be cast to float16") + if device_map is None: + raise ValueError( + "A device map needs to be passed to convert models into mixed-int8 format. Please run" + " `.from_pretrained` with `device_map='auto'`" + ) + if from_tf or from_flax: + raise ValueError( + "Converting into mixed 8-bit weights from tf/flax weights is currently not supported, please make" + " sure the weights are in PyTorch format." + ) + from_pt = not (from_tf | from_flax) user_agent = {"file_type": "model", "framework": "pytorch", "from_auto_class": from_auto_class} @@ -2063,12 +2126,19 @@ logger.info("Detected DeepSpeed ZeRO-3: activating zero.init() for this model") init_contexts = [deepspeed.zero.Init(config_dict_or_path=deepspeed_config())] + init_contexts - elif low_cpu_mem_usage: + elif load_in_8bit or low_cpu_mem_usage: init_contexts.append(init_empty_weights()) with ContextManagers(init_contexts): model = cls(config, *model_args, **model_kwargs) + if load_in_8bit: + logger.info("Detected 8-bit loading: activating mixed-int8 loading for this model") + + # We never convert lm_head or any last modules for numerical stability reasons + modules_to_not_convert = get_key_to_not_convert(model) + model = replace_8bit_linear(model, threshold=int8_threshold, modules_to_not_convert=modules_to_not_convert) + if isinstance(device_map, str): if model._no_split_modules is None: raise ValueError(f"{model.__class__.__name__} does not support `device_map='{device_map}'` yet.") @@ -2091,9 +2161,21 @@ # Make sure tied weights are tied before creating the device map.
model.tie_weights() device_map = infer_auto_device_map( - model, no_split_module_classes=no_split_modules, dtype=torch_dtype, max_memory=max_memory + model, + no_split_module_classes=no_split_modules, + dtype=torch_dtype if not load_in_8bit else torch.int8, + max_memory=max_memory, ) + if load_in_8bit: + # The LM head can stay on disk / CPU + device_map_without_lm_head = { + key: device_map[key] for key in device_map.keys() if key != modules_to_not_convert + } + if "cpu" in device_map_without_lm_head.values() or "disk" in device_map_without_lm_head.values(): + raise ValueError("8-bit operations with `bitsandbytes` are not supported on CPU!") + del device_map_without_lm_head + if from_tf: if resolved_archive_file.endswith(".index"): # Load from a TensorFlow 1.X checkpoint - provided by original authors @@ -2145,6 +2227,7 @@ offload_folder=offload_folder, offload_state_dict=offload_state_dict, dtype=torch_dtype, + load_in_8bit=load_in_8bit, ) # make sure token embedding weights are still tied if needed @@ -2185,6 +2268,7 @@ def _load_pretrained_model( offload_folder=None, offload_state_dict=None, dtype=None, + load_in_8bit=False, ): if device_map is not None and "disk" in device_map.values(): if offload_folder is None: @@ -2250,7 +2334,10 @@ def _fix_key(key): key = ".".join(key.split(".")[1:]) param = model_state_dict[key] if param.device == torch.device("meta"): - set_module_tensor_to_device(model, key, "cpu", torch.empty(*param.size())) + if not load_in_8bit: + set_module_tensor_to_device(model, key, "cpu", torch.empty(*param.size())) + else: + set_module_8bit_tensor_to_device(model, key, "cpu", torch.empty(*param.size())) # retrieve unintialized modules and initialize before maybe overriding that with the pretrained weights. if _fast_init: @@ -2359,6 +2446,7 @@ def _find_mismatched_keys( state_dict_folder=state_dict_folder, state_dict_index=state_dict_index, dtype=dtype, + load_in_8bit=load_in_8bit, ) error_msgs += new_error_msgs else: diff --git a/src/transformers/utils/bitsandbytes.py b/src/transformers/utils/bitsandbytes.py new file mode 100644 index 00000000000000..ee4e52d421fd09 --- /dev/null +++ b/src/transformers/utils/bitsandbytes.py @@ -0,0 +1,142 @@ +from transformers.utils import is_accelerate_available, is_bitsandbytes_available + + +if is_bitsandbytes_available(): + import torch + import torch.nn as nn + + import bitsandbytes as bnb + +if is_accelerate_available(): + from accelerate import init_empty_weights + + +def set_module_8bit_tensor_to_device(module, tensor_name, device, value=None): + """ + A helper function to set a given tensor (parameter or buffer) of a module on a specific device (note that doing + `param.to(device)` creates a new tensor not linked to the parameter, which is why we need this function). The + function is adapted from the `set_module_tensor_to_device` function from accelerate, extended to support the + `Int8Params` class from `bitsandbytes`. + + Args: + module (`torch.nn.Module`): + The module in which the tensor we want to move lives. + tensor_name (`str`): + The full name of the parameter/buffer. + device (`int`, `str` or `torch.device`): + The device on which to set the tensor. + value (`torch.Tensor`, *optional*): + The value of the tensor (useful when going from the meta device to any other device). + """ + # Recurse if needed + if "."
in tensor_name: + splits = tensor_name.split(".") + for split in splits[:-1]: + new_module = getattr(module, split) + if new_module is None: + raise ValueError(f"{module} has no attribute {split}.") + module = new_module + tensor_name = splits[-1] + + if tensor_name not in module._parameters and tensor_name not in module._buffers: + raise ValueError(f"{module} does not have a parameter or a buffer named {tensor_name}.") + is_buffer = tensor_name in module._buffers + old_value = getattr(module, tensor_name) + + if old_value.device == torch.device("meta") and device not in ["meta", torch.device("meta")] and value is None: + raise ValueError(f"{tensor_name} is on the meta device, we need a `value` to put it on {device}.") + + if is_buffer: + has_fp16_weights = None + else: + has_fp16_weights = getattr(module._parameters[tensor_name], "has_fp16_weights", None) + + if has_fp16_weights is not None: + param = module._parameters[tensor_name] + if param.device.type != "cuda": + if value is None: + new_value = old_value.to(device) + elif isinstance(value, torch.Tensor): + new_value = value.to("cpu") + if value.dtype == torch.int8: + raise ValueError( + "You cannot load weights that are saved in int8 using `load_in_8bit=True`, make sure you are" + " using `load_in_8bit=True` on float32/float16/bfloat16 weights." + ) + else: + new_value = torch.tensor(value, device="cpu") + new_value = bnb.nn.Int8Params(new_value, requires_grad=False, has_fp16_weights=has_fp16_weights).to(device) + module._parameters[tensor_name] = new_value + else: + if value is None: + new_value = old_value.to(device) + elif isinstance(value, torch.Tensor): + new_value = value.to(device) + else: + new_value = torch.tensor(value, device=device) + + if is_buffer: + module._buffers[tensor_name] = new_value + else: + new_value = nn.Parameter(new_value, requires_grad=old_value.requires_grad) + module._parameters[tensor_name] = new_value + + +def replace_8bit_linear(model, threshold=6.0, modules_to_not_convert="lm_head"): + """ + A helper function to replace all `torch.nn.Linear` modules by `bnb.nn.Linear8bitLt` modules from the `bitsandbytes` + library. This will enable running your models using mixed int8 precision as described by the paper `LLM.int8(): + 8-bit Matrix Multiplication for Transformers at Scale`. Make sure `bitsandbytes` compiled for the correct CUDA + version of your hardware is installed before running this function: `pip install -i https://test.pypi.org/simple/ + bitsandbytes` + + The function is run recursively and replaces all `torch.nn.Linear` modules except for the `lm_head`, which should + be kept as a `torch.nn.Linear` module. The replacement is done under the `init_empty_weights` context manager so no + CPU/GPU memory is required to run this function. Int8 mixed-precision matrix decomposition works by separating a + matrix multiplication into two streams: (1) a systematic feature outlier stream multiplied in fp16 + (0.01% of values), and (2) a regular stream of int8 matrix multiplication (99.9% of values). With this method, int8 inference with no + predictive degradation is possible for very large models (>=176B parameters). + + Parameters: + model (`torch.nn.Module`): + Input model or `torch.nn.Module` as the function is run recursively. + threshold (`float`, *optional*, defaults to 6.0): + `int8_threshold` for outlier detection as described in the aforementioned paper. This parameter is set to + `6.0` as described by the paper.
+ modules_to_not_convert (`str`, *optional*, defaults to `lm_head`): + Name of the module that should not be converted to `Linear8bitLt`. In practice we keep the `lm_head` in full precision + for numerical stability reasons. + """ + for name, module in model.named_children(): + if len(list(module.children())) > 0: + replace_8bit_linear(module, threshold, modules_to_not_convert) + + if isinstance(module, nn.Linear) and name != modules_to_not_convert: + with init_empty_weights(): + model._modules[name] = bnb.nn.Linear8bitLt( + module.in_features, + module.out_features, + module.bias is not None, + has_fp16_weights=False, + threshold=threshold, + ) + return model + + +def get_key_to_not_convert(model): + r""" + A utility function to get the key of the module to keep in full precision, if any. For example, for CausalLM modules + we may want to keep the `lm_head` in full precision for numerical stability reasons. + + Parameters: + model (`torch.nn.Module`): + Input model + """ + # Ignore this for base models (BertModel, GPT2Model, etc.) + if not hasattr(model, model.base_model_prefix): + return "" + + # Otherwise the model has an attached head + list_modules = list(model.named_parameters()) + last_name = list_modules[-1][0] + return last_name.split(".")[0] diff --git a/tests/mixed_int8/README.md b/tests/mixed_int8/README.md new file mode 100644 index 00000000000000..c0173bed7a6b7a --- /dev/null +++ b/tests/mixed_int8/README.md @@ -0,0 +1,37 @@ +# Testing mixed int8 quantization + +## Hardware requirements + +This setup was tested with 2 NVIDIA Tesla T4 (15GB) GPUs. + +## Virtual envs + +```bash +conda create --name int8-testing python==3.8 +git clone https://github.com/younesbelkada/transformers.git && cd transformers && git checkout integration-8bit +pip install -e ".[dev]" +pip install -i https://test.pypi.org/simple/ bitsandbytes +pip install git+https://github.com/huggingface/accelerate.git@e0212893ea6098cc0a7a3c7a6eb286a9104214c1 +``` + +## Troubleshooting + +```bash +conda create --name int8-testing python==3.8 +pip install -i https://test.pypi.org/simple/ bitsandbytes +conda install pytorch torchvision torchaudio -c pytorch +git clone https://github.com/younesbelkada/transformers.git && cd transformers && git checkout integration-8bit +pip install -e ".[dev]" +pip install git+https://github.com/huggingface/accelerate.git@b52b793ea8bac108ba61192eead3cf11ca02433d +``` + +### Check driver settings + +``` +nvcc --version +``` + +``` +ls -l $CONDA_PREFIX/lib/libcudart.so +``` + +### Recurrent bugs + +Sometimes you have to run a "dummy" inference pass when dealing with a multi-GPU setup. Check out the `test_multi_gpu_loading` and `test_pipeline` functions. \ No newline at end of file diff --git a/tests/mixed_int8/__init__.py b/tests/mixed_int8/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/tests/mixed_int8/test_mixed_int8.py b/tests/mixed_int8/test_mixed_int8.py new file mode 100644 index 00000000000000..0cd7ca16411c19 --- /dev/null +++ b/tests/mixed_int8/test_mixed_int8.py @@ -0,0 +1,215 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Team Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import gc +import unittest + +from transformers import AutoModel, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer, pipeline +from transformers.testing_utils import ( + is_torch_available, + require_accelerate, + require_bitsandbytes, + require_torch, + require_torch_gpu, + require_torch_multi_gpu, + slow, +) + + +if is_torch_available(): + import torch + + +@require_bitsandbytes +@require_accelerate +@require_torch +@require_torch_gpu +@slow +class BaseMixedInt8Test(unittest.TestCase): + # We keep the constants inside the init function and model loading inside the setUp function + + # We need to test on relatively large models (i.e. >1B parameters, otherwise the quantization may not work as expected) + # Therefore here we use only bloom-1b7 to test our module + model_name = "bigscience/bloom-1b7" + + # Constant values + EXPECTED_RELATIVE_DIFFERENCE = ( + 1.540025 # This was obtained on a Quadro RTX 8000 so the number might change slightly + ) + + input_text = "Hello my name is" + EXPECTED_OUTPUT = "Hello my name is John.\nI am a friend of the family.\n" + MAX_NEW_TOKENS = 10 + + def setUp(self): + # Models and tokenizer + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) + + +class MixedInt8Test(BaseMixedInt8Test): + def setUp(self): + super().setUp() + + # Models and tokenizer + self.model_fp16 = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", device_map="auto") + self.model_8bit = AutoModelForCausalLM.from_pretrained(self.model_name, load_in_8bit=True, device_map="auto") + + def tearDown(self): + r""" + The tearDown function needs to be called at the end of each test to free the GPU memory and cache, and also to + avoid unexpected behaviors. Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27 + """ + del self.model_fp16 + del self.model_8bit + + gc.collect() + torch.cuda.empty_cache() + + def test_memory_footprint(self): + r""" + A simple test to check that the model conversion has been done correctly, by checking the + memory footprint of the converted model and the class type of the linear layers of the converted model + """ + from bitsandbytes.nn import Int8Params + + mem_fp16 = self.model_fp16.get_memory_footprint() + mem_8bit = self.model_8bit.get_memory_footprint() + + self.assertAlmostEqual(mem_fp16 / mem_8bit, self.EXPECTED_RELATIVE_DIFFERENCE) + self.assertTrue(self.model_8bit.transformer.h[0].mlp.dense_4h_to_h.weight.__class__ == Int8Params) + + def test_generate_quality(self): + r""" + Test the generation quality of the quantized model and see that we are matching the expected output. + Given that we are operating on small numbers + the testing model is relatively small, we might not get + the same output across GPUs. So we'll generate a few tokens (5-10) and check their output.
+ """ + encoded_input = self.tokenizer(self.input_text, return_tensors="pt") + output_sequences = self.model_8bit.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10) + + self.assertEqual(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUT) + + +class MixedInt8ModelClassesTest(BaseMixedInt8Test): + def setUp(self): + super().setUp() + # model_name + self.model_name = "bigscience/bloom-560m" + # Models and tokenizer + self.base_model = AutoModel.from_pretrained(self.model_name, load_in_8bit=True, device_map="auto") + self.sequence_model = AutoModelForSequenceClassification.from_pretrained( + self.model_name, load_in_8bit=True, device_map="auto" + ) + self.model_8bit = AutoModelForCausalLM.from_pretrained(self.model_name, load_in_8bit=True, device_map="auto") + + def tearDown(self): + r""" + TearDown function needs to be called at the end of each test to free the GPU memory and cache, also to + avoid unexpected behaviors. Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27 + """ + del self.base_model + del self.sequence_model + del self.model_8bit + + gc.collect() + torch.cuda.empty_cache() + + def test_correct_head_class(self): + r""" + A simple test to check if the last modules for some classes (AutoModelForCausalLM or SequenceClassification) + are kept in their native class. + """ + from bitsandbytes.nn import Int8Params + + # last param of a base model should be a linear8bit module + self.assertTrue(self.base_model.h[-1].mlp.dense_4h_to_h.weight.__class__ == Int8Params) + + # Other heads should be nn.Parameter + self.assertTrue(self.model_8bit.lm_head.weight.__class__ == torch.nn.Parameter) + self.assertTrue(self.sequence_model.score.weight.__class__ == torch.nn.Parameter) + + +class MixedInt8TestPipeline(BaseMixedInt8Test): + def setUp(self): + super().setUp() + + def tearDown(self): + r""" + TearDown function needs to be called at the end of each test to free the GPU memory and cache, also to + avoid unexpected behaviors. Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27 + """ + del self.pipe + + gc.collect() + torch.cuda.empty_cache() + + def test_pipeline(self): + r""" + The aim of this test is to verify that the mixed int8 is compatible with `pipeline` from transformers. Since + we used pipline for inference speed benchmarking we want to make sure that this feature does not break anything + on pipline. + """ + # self._clear_cuda_cache() + self.pipe = pipeline( + "text-generation", + model=self.model_name, + model_kwargs={"device_map": "auto", "load_in_8bit": True}, + max_new_tokens=self.MAX_NEW_TOKENS, + ) + + # Real second forward pass + pipeline_output = self.pipe(self.input_text) + self.assertEqual(pipeline_output[0]["generated_text"], self.EXPECTED_OUTPUT) + + +@require_torch_multi_gpu +class MixedInt8TestMultiGpu(BaseMixedInt8Test): + def setUp(self): + super().setUp() + + def test_multi_gpu_loading(self): + r""" + This tests that the model has been loaded and can be used correctly on a multi-GPU setup. + Let's just try to load a model on 2 GPUs and see if it works. 
The model we test has ~2GB of total, 3GB should suffice + """ + + memory_mapping = {0: "1GB", 1: "2GB"} + model_parallel = AutoModelForCausalLM.from_pretrained( + self.model_name, load_in_8bit=True, max_memory=memory_mapping, device_map="auto" + ) + + def get_list_devices(model): + list_devices = [] + for _, module in model.named_children(): + if len(list(module.children())) > 0: + list_devices.extend(get_list_devices(module)) + else: + # Do a try except since we can encounter Dropout modules that does not + # have any device set + try: + list_devices.append(next(module.parameters()).device.index) + except BaseException: + continue + return list_devices + + list_devices = get_list_devices(model_parallel) + # Check that we have dispatched the model into 2 separate devices + self.assertTrue((1 in list_devices) and (0 in list_devices)) + + # Check that inference pass works on the model + encoded_input = self.tokenizer(self.input_text, return_tensors="pt") + + # Second real batch + output_parallel = model_parallel.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10) + self.assertEqual(self.tokenizer.decode(output_parallel[0], skip_special_tokens=True), self.EXPECTED_OUTPUT) diff --git a/utils/tests_fetcher.py b/utils/tests_fetcher.py index e8c171f2187901..c02f7155cce673 100644 --- a/utils/tests_fetcher.py +++ b/utils/tests_fetcher.py @@ -466,6 +466,7 @@ def module_to_test_file(module_fname): "tests/sagemaker/test_single_node_gpu.py", # SageMaker test "tests/sagemaker/test_multi_node_model_parallel.py", # SageMaker test "tests/sagemaker/test_multi_node_data_parallel.py", # SageMaker test + "tests/mixed_int8/test_mixed_int8.py", # Mixed-int8 bitsandbytes test ] From ca3833e2836bdbc7ef5445170d20ee40a5b635c7 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Wed, 10 Aug 2022 12:57:21 +0100 Subject: [PATCH 098/162] TF: XLA-trainable DeBERTa v2 (#18546) * fix deberta issues * add different code paths for gpu and tpu * shorter gpu take along axis * Stable Dropout without tf cond * variable must be float --- .../models/deberta/modeling_tf_deberta.py | 49 +++++++------- .../deberta_v2/modeling_tf_deberta_v2.py | 67 ++++++++++--------- 2 files changed, 62 insertions(+), 54 deletions(-) diff --git a/src/transformers/models/deberta/modeling_tf_deberta.py b/src/transformers/models/deberta/modeling_tf_deberta.py index 1d8c01e24acda0..edb9b2b0855532 100644 --- a/src/transformers/models/deberta/modeling_tf_deberta.py +++ b/src/transformers/models/deberta/modeling_tf_deberta.py @@ -101,27 +101,6 @@ def call(self, inputs: tf.Tensor, mask: tf.Tensor): return output -def get_mask(input, dropout): - mask = tf.cast( - 1 - tf.compat.v1.distributions.Bernoulli(probs=1 - dropout).sample(sample_shape=shape_list(input)), tf.bool - ) - return mask, dropout - - -@tf.custom_gradient -def TFDebertaXDropout(input, local_ctx): - mask, dropout = get_mask(input, local_ctx) - scale = tf.convert_to_tensor(1.0 / (1 - dropout), dtype=tf.float32) - input = tf.cond(dropout > 0, lambda: tf.where(mask, 0.0, input) * scale, lambda: input) - - def custom_grad(upstream_grad): - return tf.cond( - scale > 1, lambda: (tf.where(mask, 0.0, upstream_grad) * scale, None), lambda: (upstream_grad, None) - ) - - return input, custom_grad - - class TFDebertaStableDropout(tf.keras.layers.Layer): """ Optimized dropout module for stabilizing the training @@ -132,11 +111,33 @@ class TFDebertaStableDropout(tf.keras.layers.Layer): def __init__(self, drop_prob, **kwargs): super().__init__(**kwargs) - self.drop_prob = 
tf.convert_to_tensor(drop_prob, dtype=tf.float32) + self.drop_prob = drop_prob + + @tf.custom_gradient + def xdropout(self, inputs): + """ + Applies dropout to the inputs, as vanilla dropout, but also scales the remaining elements up by 1/(1 - drop_prob). + """ + mask = tf.cast( + 1 + - tf.compat.v1.distributions.Bernoulli(probs=1.0 - self.drop_prob).sample(sample_shape=shape_list(inputs)), + tf.bool, + ) + scale = tf.convert_to_tensor(1.0 / (1 - self.drop_prob), dtype=tf.float32) + if self.drop_prob > 0: + inputs = tf.where(mask, 0.0, inputs) * scale + + def grad(upstream): + if self.drop_prob > 0: + return tf.where(mask, 0.0, upstream) * scale + else: + return upstream + + return inputs, grad def call(self, inputs: tf.Tensor, training: tf.Tensor = False): - if training and self.drop_prob > 0: - return TFDebertaXDropout(inputs, self.drop_prob) + if training: + return self.xdropout(inputs) return inputs diff --git a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py index aabb3b2d380ea1..fa9a202427e5e9 100644 --- a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py @@ -102,29 +102,6 @@ def call(self, inputs: tf.Tensor, mask: tf.Tensor): return output -# Copied from transformers.models.deberta.modeling_tf_deberta.get_mask -def get_mask(input, dropout): - mask = tf.cast( - 1 - tf.compat.v1.distributions.Bernoulli(probs=1 - dropout).sample(sample_shape=shape_list(input)), tf.bool - ) - return mask, dropout - - -@tf.custom_gradient -# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaXDropout -def TFDebertaV2XDropout(input, local_ctx): - mask, dropout = get_mask(input, local_ctx) - scale = tf.convert_to_tensor(1.0 / (1 - dropout), dtype=tf.float32) - input = tf.cond(dropout > 0, lambda: tf.where(mask, 0.0, input) * scale, lambda: input) - - def custom_grad(upstream_grad): - return tf.cond( - scale > 1, lambda: (tf.where(mask, 0.0, upstream_grad) * scale, None), lambda: (upstream_grad, None) - ) - - return input, custom_grad - - # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaStableDropout with Deberta->DebertaV2 class TFDebertaV2StableDropout(tf.keras.layers.Layer): """ @@ -136,11 +113,33 @@ class TFDebertaV2StableDropout(tf.keras.layers.Layer): def __init__(self, drop_prob, **kwargs): super().__init__(**kwargs) - self.drop_prob = tf.convert_to_tensor(drop_prob, dtype=tf.float32) + self.drop_prob = drop_prob + + @tf.custom_gradient + def xdropout(self, inputs): + """ + Applies dropout to the inputs, as vanilla dropout, but also scales the remaining elements up by 1/(1 - drop_prob).
+ """ + mask = tf.cast( + 1 + - tf.compat.v1.distributions.Bernoulli(probs=1.0 - self.drop_prob).sample(sample_shape=shape_list(inputs)), + tf.bool, + ) + scale = tf.convert_to_tensor(1.0 / (1 - self.drop_prob), dtype=tf.float32) + if self.drop_prob > 0: + inputs = tf.where(mask, 0.0, inputs) * scale + + def grad(upstream): + if self.drop_prob > 0: + return tf.where(mask, 0.0, upstream) * scale + else: + return upstream + + return inputs, grad def call(self, inputs: tf.Tensor, training: tf.Tensor = False): - if training and self.drop_prob > 0: - return TFDebertaV2XDropout(inputs, self.drop_prob) + if training: + return self.xdropout(inputs) return inputs @@ -525,10 +524,18 @@ def pos_dynamic_expand(pos_index, p2c_att, key_layer): def take_along_axis(x, indices): # Only a valid port of np.take_along_axis when the gather axis is -1 - flat_x = tf.reshape(x, (-1, x.shape[-1])) - flat_indices = tf.reshape(indices, (-1, indices.shape[-1])) - gathered = tf.gather(flat_x, flat_indices, batch_dims=1) - gathered = tf.reshape(gathered, indices.shape) + # TPU + gathers and reshapes don't go along well -- see https://github.com/huggingface/transformers/issues/18239 + if isinstance(tf.distribute.get_strategy(), tf.distribute.TPUStrategy): + # [B, S, P] -> [B, S, P, D] + one_hot_indices = tf.one_hot(indices, depth=x.shape[-1], dtype=x.dtype) + + # if we ignore the first two dims, this is equivalent to multiplying a matrix (one hot) by a vector (x) + # grossly abusing notation: [B, S, P, D] . [B, S, D] = [B, S, P] + gathered = tf.einsum("ijkl,ijl->ijk", one_hot_indices, x) + + # GPUs, on the other hand, prefer gathers instead of large one-hot+matmuls + else: + gathered = tf.gather(x, indices, batch_dims=2) return gathered From b84379c4618ab162049bdfb40c0214f1594a6c22 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Wed, 10 Aug 2022 08:00:18 -0400 Subject: [PATCH 099/162] Preserve hub-related kwargs in AutoModel.from_pretrained (#18545) * Preserve hub-related kwargs in AutoModel.from_pretrained * Fix tests * Remove debug statement --- src/transformers/models/auto/auto_factory.py | 29 +++++++++++++++---- .../models/auto/configuration_auto.py | 6 ++-- 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/auto/auto_factory.py b/src/transformers/models/auto/auto_factory.py index b7d8f66c339dd4..b412f14157f1c3 100644 --- a/src/transformers/models/auto/auto_factory.py +++ b/src/transformers/models/auto/auto_factory.py @@ -419,9 +419,24 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): config = kwargs.pop("config", None) trust_remote_code = kwargs.pop("trust_remote_code", False) kwargs["_from_auto"] = True + hub_kwargs_names = [ + "cache_dir", + "force_download", + "local_files_only", + "proxies", + "resume_download", + "revision", + "subfolder", + "use_auth_token", + ] + hub_kwargs = {name: kwargs.pop(name) for name in hub_kwargs_names if name in kwargs} if not isinstance(config, PretrainedConfig): config, kwargs = AutoConfig.from_pretrained( - pretrained_model_name_or_path, return_unused_kwargs=True, trust_remote_code=trust_remote_code, **kwargs + pretrained_model_name_or_path, + return_unused_kwargs=True, + trust_remote_code=trust_remote_code, + **hub_kwargs, + **kwargs, ) if hasattr(config, "auto_map") and cls.__name__ in config.auto_map: if not trust_remote_code: @@ -430,7 +445,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): "on your local machine. 
Make sure you have read the code there to avoid malicious use, then set " "the option `trust_remote_code=True` to remove this error." ) - if kwargs.get("revision", None) is None: + if hub_kwargs.get("revision", None) is None: logger.warning( "Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure " "no malicious code has been contributed in a newer revision." @@ -438,12 +453,16 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): class_ref = config.auto_map[cls.__name__] module_file, class_name = class_ref.split(".") model_class = get_class_from_dynamic_module( - pretrained_model_name_or_path, module_file + ".py", class_name, **kwargs + pretrained_model_name_or_path, module_file + ".py", class_name, **hub_kwargs, **kwargs + ) + return model_class.from_pretrained( + pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs ) - return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) elif type(config) in cls._model_mapping.keys(): model_class = _get_model_class(config, cls._model_mapping) - return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) + return model_class.from_pretrained( + pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs + ) raise ValueError( f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n" f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}." diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index d8ecbb49e64f29..c65a2762a00029 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -728,7 +728,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): kwargs["_from_auto"] = True kwargs["name_or_path"] = pretrained_model_name_or_path trust_remote_code = kwargs.pop("trust_remote_code", False) - config_dict, _ = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs) + config_dict, unused_kwargs = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs) if "auto_map" in config_dict and "AutoConfig" in config_dict["auto_map"]: if not trust_remote_code: raise ValueError( @@ -749,13 +749,13 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): return config_class.from_pretrained(pretrained_model_name_or_path, **kwargs) elif "model_type" in config_dict: config_class = CONFIG_MAPPING[config_dict["model_type"]] - return config_class.from_dict(config_dict, **kwargs) + return config_class.from_dict(config_dict, **unused_kwargs) else: # Fallback: use pattern matching on the string. # We go from longer names to shorter names to catch roberta before bert (for instance) for pattern in sorted(CONFIG_MAPPING.keys(), key=len, reverse=True): if pattern in str(pretrained_model_name_or_path): - return CONFIG_MAPPING[pattern].from_dict(config_dict, **kwargs) + return CONFIG_MAPPING[pattern].from_dict(config_dict, **unused_kwargs) raise ValueError( f"Unrecognized model in {pretrained_model_name_or_path}. 
" From 8d7065ec63c2b072975b12b7a78f0a1d335026fe Mon Sep 17 00:00:00 2001 From: Matt Date: Wed, 10 Aug 2022 11:49:51 -0400 Subject: [PATCH 100/162] TF Examples Rewrite (#18451) * Finished QA example * Dodge a merge conflict * Update text classification and LM examples * Update NER example * New Keras metrics WIP, fix NER example * Update NER example * Update MC, summarization and translation examples * Add XLA warnings when shapes are variable * Make sure batch_size is consistently scaled by num_replicas * Add PushToHubCallback to all models * Add docs links for KerasMetricCallback * Add docs links for prepare_tf_dataset and jit_compile * Correct inferred model names * Don't assume the dataset has 'lang' * Don't assume the dataset has 'lang' * Write metrics in text classification * Add 'framework' to TrainingArguments and TFTrainingArguments * Export metrics in all examples and add tests * Fix training args for Flax * Update command line args for translation test * make fixup * Fix accidentally running other tests in fp16 * Remove do_train/do_eval from run_clm.py * Remove do_train/do_eval from run_mlm.py * Add tensorflow tests to circleci * Fix circleci * Update examples/tensorflow/language-modeling/run_mlm.py Co-authored-by: Joao Gante * Update examples/tensorflow/test_tensorflow_examples.py Co-authored-by: Joao Gante * Update examples/tensorflow/translation/run_translation.py Co-authored-by: Joao Gante * Update examples/tensorflow/token-classification/run_ner.py Co-authored-by: Joao Gante * Fix save path for tests * Fix some model card kwargs * Explain the magical -1000 * Actually enable tests this time * Skip text classification PR until we fix shape inference * make fixup Co-authored-by: Joao Gante --- .circleci/config.yml | 67 ++++ examples/tensorflow/_tests_requirements.txt | 25 ++ .../tensorflow/language-modeling/run_clm.py | 159 ++++++--- .../tensorflow/language-modeling/run_mlm.py | 140 +++++--- .../tensorflow/multiple-choice/run_swag.py | 132 +++++--- .../tensorflow/question-answering/run_qa.py | 187 ++++++++--- .../summarization/run_summarization.py | 278 ++++++++------- .../tensorflow/test_tensorflow_examples.py | 295 ++++++++++++++++ .../text-classification/run_glue.py | 135 +++++--- .../run_text_classification.py | 141 +++++--- .../token-classification/run_ner.py | 213 ++++++------ .../tensorflow/translation/run_translation.py | 317 ++++++++++-------- src/transformers/optimization_tf.py | 16 +- src/transformers/training_args.py | 43 +-- src/transformers/training_args_tf.py | 4 +- 15 files changed, 1491 insertions(+), 661 deletions(-) create mode 100644 examples/tensorflow/_tests_requirements.txt create mode 100644 examples/tensorflow/test_tensorflow_examples.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 83ee65248e9cac..666505ab3b4389 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -658,6 +658,71 @@ jobs: - store_artifacts: path: ~/transformers/reports + run_examples_tensorflow: + working_directory: ~/transformers + docker: + - image: cimg/python:3.7.12 + environment: + OMP_NUM_THREADS: 1 + TRANSFORMERS_IS_CI: yes + PYTEST_TIMEOUT: 120 + resource_class: xlarge + parallelism: 1 + steps: + - checkout + - restore_cache: + keys: + - v0.5-tensorflow_examples-{{ checksum "setup.py" }} + - v0.5-{{ checksum "setup.py" }} + - run: pip install --upgrade pip + - run: pip install .[sklearn,tensorflow,sentencepiece,testing] + - run: pip install -r examples/tensorflow/_tests_requirements.txt + - save_cache: + key: v0.5-tensorflow_examples-{{ checksum 
"setup.py" }} + paths: + - '~/.cache/pip' + - run: python utils/tests_fetcher.py --filters examples tests | tee test_preparation.txt + - store_artifacts: + path: ~/transformers/test_preparation.txt + - run: | + if [ -f test_list.txt ]; then + python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile -s --make-reports=examples_tensorflow ./examples/tensorflow/ | tee tests_output.txt + fi + - store_artifacts: + path: ~/transformers/tensorflow_examples_output.txt + - store_artifacts: + path: ~/transformers/reports + + run_examples_tensorflow_all: + working_directory: ~/transformers + docker: + - image: cimg/python:3.7.12 + environment: + OMP_NUM_THREADS: 1 + TRANSFORMERS_IS_CI: yes + PYTEST_TIMEOUT: 120 + resource_class: xlarge + parallelism: 1 + steps: + - checkout + - restore_cache: + keys: + - v0.5-tensorflow_examples-{{ checksum "setup.py" }} + - v0.5-{{ checksum "setup.py" }} + - run: pip install --upgrade pip + - run: pip install .[sklearn,tensorflow,sentencepiece,testing] + - run: pip install -r examples/tensorflow/_tests_requirements.txt + - save_cache: + key: v0.5-tensorflow_examples-{{ checksum "setup.py" }} + paths: + - '~/.cache/pip' + - run: | + TRANSFORMERS_IS_CI=1 python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile -s --make-reports=examples_tensorflow ./examples/tensorflow/ | tee examples_output.txt + - store_artifacts: + path: ~/transformers/tensorflow_examples_output.txt + - store_artifacts: + path: ~/transformers/reports + run_examples_flax: working_directory: ~/transformers docker: @@ -1000,6 +1065,7 @@ workflows: - check_code_quality - check_repository_consistency - run_examples_torch + - run_examples_tensorflow - run_examples_flax - run_tests_custom_tokenizers - run_tests_torch_and_tf @@ -1022,6 +1088,7 @@ workflows: - main jobs: - run_examples_torch_all + - run_examples_tensorflow_all - run_examples_flax_all - run_tests_torch_and_tf_all - run_tests_torch_and_flax_all diff --git a/examples/tensorflow/_tests_requirements.txt b/examples/tensorflow/_tests_requirements.txt new file mode 100644 index 00000000000000..37e37e35259176 --- /dev/null +++ b/examples/tensorflow/_tests_requirements.txt @@ -0,0 +1,25 @@ +tensorflow +tensorboard +scikit-learn +seqeval +psutil +sacrebleu >= 1.4.12 +git+https://github.com/huggingface/accelerate@main#egg=accelerate +rouge-score +tensorflow_datasets +matplotlib +git-python==1.0.3 +faiss-cpu +streamlit +elasticsearch +nltk +pandas +datasets >= 1.13.3 +fire +pytest +conllu +sentencepiece != 0.1.92 +protobuf +jiwer +librosa +evaluate >= 0.2.0 diff --git a/examples/tensorflow/language-modeling/run_clm.py b/examples/tensorflow/language-modeling/run_clm.py index 3f12683d10d997..cbe2f54f22731d 100755 --- a/examples/tensorflow/language-modeling/run_clm.py +++ b/examples/tensorflow/language-modeling/run_clm.py @@ -22,6 +22,8 @@ """ # You can also adapt this script on your own clm task. Pointers for this are left as comments. +import json + # region Imports import logging import math @@ -46,8 +48,8 @@ TF_MODEL_FOR_CAUSAL_LM_MAPPING, AutoConfig, AutoTokenizer, - DefaultDataCollator, HfArgumentParser, + PushToHubCallback, TFAutoModelForCausalLM, TFTrainingArguments, create_optimizer, @@ -205,21 +207,6 @@ def __post_init__(self): assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." 
-# endregion - -# region Helper classes -class SavePretrainedCallback(tf.keras.callbacks.Callback): - # Hugging Face models have a save_pretrained() method that saves both the weights and the necessary - # metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback - # that saves the model with this method after each epoch. - def __init__(self, output_dir, **kwargs): - super().__init__() - self.output_dir = output_dir - - def on_epoch_end(self, epoch, logs=None): - self.model.save_pretrained(self.output_dir) - - # endregion @@ -299,6 +286,7 @@ def main(): raw_datasets = load_dataset( data_args.dataset_name, data_args.dataset_config_name, + cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) if "validation" not in raw_datasets.keys(): @@ -306,12 +294,14 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, split=f"train[:{data_args.validation_split_percentage}%]", + cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) raw_datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"train[{data_args.validation_split_percentage}%:]", + cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) else: @@ -321,16 +311,39 @@ def main(): data_files["train"] = data_args.train_file if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file - extension = data_args.train_file.split(".")[-1] + extension = ( + data_args.train_file.split(".")[-1] + if data_args.train_file is not None + else data_args.validation_file.split(".")[-1] + ) if extension == "txt": extension = "text" dataset_args["keep_linebreaks"] = data_args.keep_linebreaks raw_datasets = load_dataset( extension, data_files=data_files, + cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, **dataset_args, ) + # If no validation data is there, validation_split_percentage will be used to divide the dataset. + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + extension, + data_files=data_files, + split=f"train[:{data_args.validation_split_percentage}%]", + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + **dataset_args, + ) + raw_datasets["train"] = load_dataset( + extension, + data_files=data_files, + split=f"train[{data_args.validation_split_percentage}%:]", + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + **dataset_args, + ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. 
# endregion @@ -446,7 +459,7 @@ def group_texts(examples): eval_dataset = eval_dataset.select(range(max_eval_samples)) # Log a few random samples from the training set: - for index in random.sample(range(len(train_dataset)), 3): + for index in random.sample(range(len(train_dataset)), min(3, len(train_dataset))): logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") # endregion @@ -465,44 +478,88 @@ def group_texts(examples): # region TF Dataset preparation num_replicas = training_args.strategy.num_replicas_in_sync - data_collator = DefaultDataCollator(return_tensors="tf") options = tf.data.Options() options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF - tf_train_dataset = train_dataset.to_tf_dataset( - # labels are passed as input, as we will use the model's internal loss - columns=[col for col in train_dataset.features if col != "special_tokens_mask"], + # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in + # training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also + # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names + # yourself if you use this method, whereas they are automatically inferred from the model input names when + # using model.prepare_tf_dataset() + # For more info see the docs: + # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset + # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset + + tf_train_dataset = model.prepare_tf_dataset( + train_dataset, shuffle=True, batch_size=num_replicas * training_args.per_device_train_batch_size, - collate_fn=data_collator, - drop_remainder=True, ).with_options(options) - tf_eval_dataset = eval_dataset.to_tf_dataset( - # labels are passed as input, as we will use the model's internal loss - columns=[col for col in eval_dataset.features if col != "special_tokens_mask"], + tf_eval_dataset = model.prepare_tf_dataset( + eval_dataset, shuffle=False, - batch_size=num_replicas * training_args.per_device_train_batch_size, - collate_fn=data_collator, + batch_size=num_replicas * training_args.per_device_eval_batch_size, drop_remainder=True, ).with_options(options) # endregion # region Optimizer and loss - batches_per_epoch = len(train_dataset) // (num_replicas * training_args.per_device_train_batch_size) + num_train_steps = len(tf_train_dataset) * int(training_args.num_train_epochs) + if training_args.warmup_steps > 0: + num_warmup_steps = training_args.warmup_steps + elif training_args.warmup_ratio > 0: + num_warmup_steps = int(num_train_steps * training_args.warmup_ratio) + else: + num_warmup_steps = 0 + # Bias and layernorm weights are automatically excluded from the decay optimizer, lr_schedule = create_optimizer( init_lr=training_args.learning_rate, - num_train_steps=int(training_args.num_train_epochs * batches_per_epoch), - num_warmup_steps=training_args.warmup_steps, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, adam_beta1=training_args.adam_beta1, adam_beta2=training_args.adam_beta2, adam_epsilon=training_args.adam_epsilon, weight_decay_rate=training_args.weight_decay, + adam_global_clipnorm=training_args.max_grad_norm, ) # no user-specified loss = will use the model internal loss - model.compile(optimizer=optimizer) + model.compile(optimizer=optimizer, 
jit_compile=training_args.xla) + # endregion + + # region Preparing push_to_hub and model card + push_to_hub_model_id = training_args.push_to_hub_model_id + model_name = model_args.model_name_or_path.split("/")[-1] + if not push_to_hub_model_id: + if data_args.dataset_name is not None: + push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}" + else: + push_to_hub_model_id = f"{model_name}-finetuned-clm" + + model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"} + if data_args.dataset_name is not None: + model_card_kwargs["dataset_tags"] = data_args.dataset_name + if data_args.dataset_config_name is not None: + model_card_kwargs["dataset_args"] = data_args.dataset_config_name + model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" + else: + model_card_kwargs["dataset"] = data_args.dataset_name + + if training_args.push_to_hub: + callbacks = [ + PushToHubCallback( + output_dir=training_args.output_dir, + model_id=push_to_hub_model_id, + organization=training_args.push_to_hub_organization, + token=training_args.push_to_hub_token, + tokenizer=tokenizer, + **model_card_kwargs, + ) + ] + else: + callbacks = [] # endregion # region Training and validation @@ -512,33 +569,45 @@ def group_texts(examples): logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}") logger.info(f" Total train batch size = {training_args.per_device_train_batch_size * num_replicas}") + # For long training runs, you may wish to use the PushToHub() callback here to save intermediate checkpoints + # to the Hugging Face Hub rather than just pushing the finished model. + # See https://huggingface.co/docs/transformers/main_classes/keras_callbacks#transformers.PushToHubCallback + history = model.fit( tf_train_dataset, validation_data=tf_eval_dataset, epochs=int(training_args.num_train_epochs), - steps_per_epoch=len(train_dataset) // (training_args.per_device_train_batch_size * num_replicas), - callbacks=[SavePretrainedCallback(output_dir=training_args.output_dir)], + callbacks=callbacks, ) + train_loss = history.history["loss"][-1] try: - train_perplexity = math.exp(history.history["loss"][-1]) + train_perplexity = math.exp(train_loss) except OverflowError: train_perplexity = math.inf + logger.info(f" Final train loss: {train_loss:.3f}") + logger.info(f" Final train perplexity: {train_perplexity:.3f}") + validation_loss = history.history["val_loss"][-1] try: - validation_perplexity = math.exp(history.history["val_loss"][-1]) + validation_perplexity = math.exp(validation_loss) except OverflowError: validation_perplexity = math.inf - logger.info(f" Final train loss: {history.history['loss'][-1]:.3f}") - logger.info(f" Final train perplexity: {train_perplexity:.3f}") - logger.info(f" Final validation loss: {history.history['val_loss'][-1]:.3f}") + logger.info(f" Final validation loss: {validation_loss:.3f}") logger.info(f" Final validation perplexity: {validation_perplexity:.3f}") - # endregion if training_args.output_dir is not None: - model.save_pretrained(training_args.output_dir) + output_eval_file = os.path.join(training_args.output_dir, "all_results.json") + results_dict = dict() + results_dict["train_loss"] = train_loss + results_dict["train_perplexity"] = train_perplexity + results_dict["eval_loss"] = validation_loss + results_dict["eval_perplexity"] = validation_perplexity + with open(output_eval_file, "w") as writer: + writer.write(json.dumps(results_dict)) + # endregion - if 
training_args.push_to_hub: - # You'll probably want to include some of your own metadata here! - model.push_to_hub() + if training_args.output_dir is not None and not training_args.push_to_hub: + # If we're not pushing to hub, at least save a local copy when we're done + model.save_pretrained(training_args.output_dir) if __name__ == "__main__": diff --git a/examples/tensorflow/language-modeling/run_mlm.py b/examples/tensorflow/language-modeling/run_mlm.py index b421ed8e669c15..43449a093411d4 100755 --- a/examples/tensorflow/language-modeling/run_mlm.py +++ b/examples/tensorflow/language-modeling/run_mlm.py @@ -22,9 +22,7 @@ """ # You can also adapt this script on your own mlm task. Pointers for this are left as comments. -# TODO Do multi-GPU and TPU tests and make sure the dataset length works as expected -# TODO Duplicate all changes over to the CLM script - +import json import logging import math import os @@ -50,6 +48,7 @@ AutoTokenizer, DataCollatorForLanguageModeling, HfArgumentParser, + PushToHubCallback, TFAutoModelForMaskedLM, TFTrainingArguments, create_optimizer, @@ -217,22 +216,6 @@ def __post_init__(self): # endregion -# region Helper classes -class SavePretrainedCallback(tf.keras.callbacks.Callback): - # Hugging Face models have a save_pretrained() method that saves both the weights and the necessary - # metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback - # that saves the model with this method after each epoch. - def __init__(self, output_dir, **kwargs): - super().__init__() - self.output_dir = output_dir - - def on_epoch_end(self, epoch, logs=None): - self.model.save_pretrained(self.output_dir) - - -# endregion - - def main(): # region Argument Parsing parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments)) @@ -492,7 +475,7 @@ def group_texts(examples): eval_dataset = eval_dataset.select(range(max_eval_samples)) # Log a few random samples from the training set: - for index in random.sample(range(len(train_dataset)), 3): + for index in random.sample(range(len(train_dataset)), min(3, len(train_dataset))): logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") # endregion @@ -517,40 +500,88 @@ def group_texts(examples): options = tf.data.Options() options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF - tf_train_dataset = train_dataset.to_tf_dataset( - # labels are passed as input, as we will use the model's internal loss - columns=[col for col in train_dataset.features if col != "special_tokens_mask"] + ["labels"], + # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in + # training. This is the recommended way to use a Hugging Face dataset when training with Keras. 
You can also
+    # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
+    # yourself if you use this method, whereas they are automatically inferred from the model input names when
+    # using model.prepare_tf_dataset()
+    # For more info see the docs:
+    # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
+    # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
+
+    tf_train_dataset = model.prepare_tf_dataset(
+        train_dataset,
         shuffle=True,
         batch_size=num_replicas * training_args.per_device_train_batch_size,
         collate_fn=data_collator,
-        drop_remainder=True,
     ).with_options(options)

-    tf_eval_dataset = eval_dataset.to_tf_dataset(
+    tf_eval_dataset = model.prepare_tf_dataset(
+        eval_dataset,
         # labels are passed as input, as we will use the model's internal loss
-        columns=[col for col in eval_dataset.features if col != "special_tokens_mask"] + ["labels"],
         shuffle=False,
-        batch_size=num_replicas * training_args.per_device_train_batch_size,
+        batch_size=num_replicas * training_args.per_device_eval_batch_size,
         collate_fn=data_collator,
         drop_remainder=True,
     ).with_options(options)
     # endregion

     # region Optimizer and loss
-    batches_per_epoch = len(train_dataset) // (num_replicas * training_args.per_device_train_batch_size)
+    num_train_steps = len(tf_train_dataset) * int(training_args.num_train_epochs)
+    if training_args.warmup_steps > 0:
+        num_warmup_steps = training_args.warmup_steps
+    elif training_args.warmup_ratio > 0:
+        num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
+    else:
+        num_warmup_steps = 0
+
     # Bias and layernorm weights are automatically excluded from the decay
     optimizer, lr_schedule = create_optimizer(
         init_lr=training_args.learning_rate,
-        num_train_steps=int(training_args.num_train_epochs * batches_per_epoch),
-        num_warmup_steps=training_args.warmup_steps,
+        num_train_steps=num_train_steps,
+        num_warmup_steps=num_warmup_steps,
         adam_beta1=training_args.adam_beta1,
         adam_beta2=training_args.adam_beta2,
         adam_epsilon=training_args.adam_epsilon,
         weight_decay_rate=training_args.weight_decay,
+        adam_global_clipnorm=training_args.max_grad_norm,
     )

     # no user-specified loss = will use the model internal loss
-    model.compile(optimizer=optimizer)
+    model.compile(optimizer=optimizer, jit_compile=training_args.xla)
+    # endregion
+
+    # region Preparing push_to_hub and model card
+    push_to_hub_model_id = training_args.push_to_hub_model_id
+    model_name = model_args.model_name_or_path.split("/")[-1]
+    if not push_to_hub_model_id:
+        if data_args.dataset_name is not None:
+            push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}"
+        else:
+            push_to_hub_model_id = f"{model_name}-finetuned-mlm"
+
+    model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "fill-mask"}
+    if data_args.dataset_name is not None:
+        model_card_kwargs["dataset_tags"] = data_args.dataset_name
+        if data_args.dataset_config_name is not None:
+            model_card_kwargs["dataset_args"] = data_args.dataset_config_name
+            model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
+        else:
+            model_card_kwargs["dataset"] = data_args.dataset_name
+
+    if training_args.push_to_hub:
+        callbacks = [
+            PushToHubCallback(
+                output_dir=training_args.output_dir,
+                model_id=push_to_hub_model_id,
+                organization=training_args.push_to_hub_organization,
+
token=training_args.push_to_hub_token,
+                tokenizer=tokenizer,
+                **model_card_kwargs,
+            )
+        ]
+    else:
+        callbacks = []
     # endregion

     # region Training and validation
@@ -560,33 +591,46 @@ def group_texts(examples):
         logger.info(f"  Instantaneous batch size per device = {training_args.per_device_train_batch_size}")
         logger.info(f"  Total train batch size = {training_args.per_device_train_batch_size * num_replicas}")

+    # For long training runs, you may wish to use the PushToHubCallback here to save intermediate checkpoints
+    # to the Hugging Face Hub rather than just pushing the finished model.
+    # See https://huggingface.co/docs/transformers/main_classes/keras_callbacks#transformers.PushToHubCallback
+
     history = model.fit(
         tf_train_dataset,
         validation_data=tf_eval_dataset,
         epochs=int(training_args.num_train_epochs),
-        steps_per_epoch=len(train_dataset) // (training_args.per_device_train_batch_size * num_replicas),
-        callbacks=[SavePretrainedCallback(output_dir=training_args.output_dir)],
+        callbacks=callbacks,
     )
+    train_loss = history.history["loss"][-1]
     try:
-        train_perplexity = math.exp(history.history["loss"][-1])
+        train_perplexity = math.exp(train_loss)
     except OverflowError:
         train_perplexity = math.inf
-    try:
-        validation_perplexity = math.exp(history.history["val_loss"][-1])
-    except OverflowError:
-        validation_perplexity = math.inf
-    logger.warning(f"  Final train loss: {history.history['loss'][-1]:.3f}")
-    logger.warning(f"  Final train perplexity: {train_perplexity:.3f}")
-    logger.warning(f"  Final validation loss: {history.history['val_loss'][-1]:.3f}")
-    logger.warning(f"  Final validation perplexity: {validation_perplexity:.3f}")
-    # endregion
+    logger.info(f"  Final train loss: {train_loss:.3f}")
+    logger.info(f"  Final train perplexity: {train_perplexity:.3f}")
+
+    validation_loss = history.history["val_loss"][-1]
+    try:
+        validation_perplexity = math.exp(validation_loss)
+    except OverflowError:
+        validation_perplexity = math.inf
+    logger.info(f"  Final validation loss: {validation_loss:.3f}")
+    logger.info(f"  Final validation perplexity: {validation_perplexity:.3f}")

-    if training_args.output_dir is not None:
-        model.save_pretrained(training_args.output_dir)
+    if training_args.output_dir is not None:
+        output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
+        results_dict = dict()
+        results_dict["train_loss"] = train_loss
+        results_dict["train_perplexity"] = train_perplexity
+        results_dict["eval_loss"] = validation_loss
+        results_dict["eval_perplexity"] = validation_perplexity
+        with open(output_eval_file, "w") as writer:
+            writer.write(json.dumps(results_dict))
+    # endregion

-    if training_args.push_to_hub:
-        # You'll probably want to append some of your own metadata here!
-        model.push_to_hub()
+    if training_args.output_dir is not None and not training_args.push_to_hub:
+        # If we're not pushing to hub, at least save a local copy when we're done
+        model.save_pretrained(training_args.output_dir)


 if __name__ == "__main__":
diff --git a/examples/tensorflow/multiple-choice/run_swag.py b/examples/tensorflow/multiple-choice/run_swag.py
index 6ba35bd0fd2023..2684500d248db9 100644
--- a/examples/tensorflow/multiple-choice/run_swag.py
+++ b/examples/tensorflow/multiple-choice/run_swag.py
@@ -18,6 +18,7 @@
 """
 # You can also adapt this script on your own multiple choice task. Pointers for this are left as comments.
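The prepare_tf_dataset() workflow that the comments repeated through this patch describe can be summarized in a minimal standalone sketch; the checkpoint and dataset below are illustrative stand-ins, not part of the patch. The method matches dataset columns to the model's input names and, when given a tokenizer, builds a default collator that pads each batch.

from datasets import load_dataset
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")

dataset = load_dataset("glue", "sst2", split="train[:128]")
dataset = dataset.map(lambda batch: tokenizer(batch["sentence"], truncation=True), batched=True)

# Columns are inferred from the model's input signature; the tokenizer supplies
# a collator that pads each batch to the length of its longest sample.
tf_dataset = model.prepare_tf_dataset(dataset, shuffle=True, batch_size=16, tokenizer=tokenizer)

model.compile(optimizer="adam")  # no loss argument: the model's internal loss is used
model.fit(tf_dataset, epochs=1)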
+import json import logging import os import sys @@ -38,6 +39,7 @@ AutoTokenizer, DefaultDataCollator, HfArgumentParser, + PushToHubCallback, TFAutoModelForMultipleChoice, TFTrainingArguments, create_optimizer, @@ -54,16 +56,6 @@ # region Helper classes and functions -class SavePretrainedCallback(tf.keras.callbacks.Callback): - # Hugging Face models have a save_pretrained() method that saves both the weights and the necessary - # metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback - # that saves the model with this method after each epoch. - def __init__(self, output_dir, **kwargs): - super().__init__() - self.output_dir = output_dir - - def on_epoch_end(self, epoch, logs=None): - self.model.save_pretrained(self.output_dir) @dataclass @@ -391,7 +383,6 @@ def preprocess_function(examples): if "train" not in raw_datasets: raise ValueError("--do_train requires a train dataset") train_dataset = raw_datasets["train"] - non_label_columns = [feature for feature in train_dataset.features if feature not in ("label", "labels")] if data_args.max_train_samples is not None: max_train_samples = min(len(train_dataset), data_args.max_train_samples) train_dataset = train_dataset.select(range(max_train_samples)) @@ -407,8 +398,6 @@ def preprocess_function(examples): if "validation" not in raw_datasets: raise ValueError("--do_eval requires a validation dataset") eval_dataset = raw_datasets["validation"] - if not training_args.do_train: - non_label_columns = [feature for feature in eval_dataset.features if feature not in ("label", "labels")] if data_args.max_eval_samples is not None: max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) eval_dataset = eval_dataset.select(range(max_eval_samples)) @@ -444,79 +433,120 @@ def preprocess_function(examples): num_replicas = training_args.strategy.num_replicas_in_sync total_train_batch_size = training_args.per_device_train_batch_size * num_replicas total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas + if training_args.do_train: - total_train_steps = (len(train_dataset) // total_train_batch_size) * int(training_args.num_train_epochs) + num_train_steps = (len(train_dataset) // total_train_batch_size) * int(training_args.num_train_epochs) + if training_args.warmup_steps > 0: + num_warmup_steps = training_args.warmup_steps + elif training_args.warmup_ratio > 0: + num_warmup_steps = int(num_train_steps * training_args.warmup_ratio) + else: + num_warmup_steps = 0 optimizer, lr_schedule = create_optimizer( - init_lr=training_args.learning_rate, num_train_steps=int(total_train_steps), num_warmup_steps=0 + init_lr=training_args.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + adam_beta1=training_args.adam_beta1, + adam_beta2=training_args.adam_beta2, + adam_epsilon=training_args.adam_epsilon, + weight_decay_rate=training_args.weight_decay, + adam_global_clipnorm=training_args.max_grad_norm, ) else: - optimizer = "adam" # Just put anything in here, since we're not using it anyway - model.compile( - optimizer=optimizer, - loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), - metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")], - ) + optimizer = None + model.compile(optimizer=optimizer, metrics=["accuracy"], jit_compile=training_args.xla) + # endregion + + # region Preparing push_to_hub and model card + push_to_hub_model_id = training_args.push_to_hub_model_id + model_name = 
model_args.model_name_or_path.split("/")[-1] + if not push_to_hub_model_id: + push_to_hub_model_id = f"{model_name}-finetuned-multiplechoice" + + model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "multiple-choice"} + + if training_args.push_to_hub: + callbacks = [ + PushToHubCallback( + output_dir=training_args.output_dir, + model_id=push_to_hub_model_id, + organization=training_args.push_to_hub_organization, + token=training_args.push_to_hub_token, + tokenizer=tokenizer, + **model_card_kwargs, + ) + ] + else: + callbacks = [] # endregion # region Training + eval_metrics = None if training_args.do_train: - dataset_exclude_cols = set(non_label_columns + ["label"]) - tf_train_dataset = train_dataset.to_tf_dataset( - columns=[col for col in train_dataset.column_names if col not in dataset_exclude_cols], + dataset_options = tf.data.Options() + dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF + + # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in + # training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also + # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names + # yourself if you use this method, whereas they are automatically inferred from the model input names when + # using model.prepare_tf_dataset() + # For more info see the docs: + # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset + # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset + + tf_train_dataset = model.prepare_tf_dataset( + train_dataset, shuffle=True, batch_size=total_train_batch_size, collate_fn=data_collator, - drop_remainder=True, - # `label_cols` is needed for user-defined losses, such as in this example - label_cols="label" if "label" in train_dataset.column_names else None, - ) + ).with_options(dataset_options) if training_args.do_eval: - validation_data = eval_dataset.to_tf_dataset( - columns=[col for col in eval_dataset.column_names if col not in dataset_exclude_cols], + validation_data = model.prepare_tf_dataset( + eval_dataset, shuffle=False, batch_size=total_eval_batch_size, collate_fn=data_collator, drop_remainder=True, - # `label_cols` is needed for user-defined losses, such as in this example - label_cols="label" if "label" in eval_dataset.column_names else None, - ) + ).with_options(dataset_options) else: validation_data = None - model.fit( + history = model.fit( tf_train_dataset, validation_data=validation_data, epochs=int(training_args.num_train_epochs), - callbacks=[SavePretrainedCallback(output_dir=training_args.output_dir)], + callbacks=callbacks, ) + eval_metrics = {key: val[-1] for key, val in history.history.items()} # endregion # region Evaluation if training_args.do_eval and not training_args.do_train: - dataset_exclude_cols = set(non_label_columns + ["label"]) + dataset_options = tf.data.Options() + dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF # Do a standalone evaluation pass - tf_eval_dataset = eval_dataset.to_tf_dataset( - columns=[col for col in eval_dataset.column_names if col not in dataset_exclude_cols], + tf_eval_dataset = model.prepare_tf_dataset( + eval_dataset, shuffle=False, batch_size=total_eval_batch_size, collate_fn=data_collator, drop_remainder=True, - # `label_cols` is 
needed for user-defined losses, such as in this example - label_cols="label" if "label" in eval_dataset.column_names else None, - ) - model.evaluate(tf_eval_dataset) + ).with_options(dataset_options) + eval_results = model.evaluate(tf_eval_dataset) + eval_metrics = {"val_loss": eval_results[0], "val_accuracy": eval_results[1]} # endregion + if eval_metrics is not None and training_args.output_dir is not None: + output_eval_file = os.path.join(training_args.output_dir, "all_results.json") + with open(output_eval_file, "w") as writer: + writer.write(json.dumps(eval_metrics)) + # region Push to hub - if training_args.push_to_hub: - model.push_to_hub( - finetuned_from=model_args.model_name_or_path, - tasks="multiple-choice", - dataset_tags="swag", - dataset_args="regular", - dataset="SWAG", - language="en", - ) + + if training_args.output_dir is not None and not training_args.push_to_hub: + # If we're not pushing to hub, at least save a local copy when we're done + model.save_pretrained(training_args.output_dir) # endregion diff --git a/examples/tensorflow/question-answering/run_qa.py b/examples/tensorflow/question-answering/run_qa.py index 91293aefb35f55..7f53a9841509c7 100755 --- a/examples/tensorflow/question-answering/run_qa.py +++ b/examples/tensorflow/question-answering/run_qa.py @@ -18,6 +18,7 @@ """ # You can also adapt this script on your own question answering task. Pointers for this are left as comments. +import json import logging import os import sys @@ -33,13 +34,13 @@ from transformers import ( AutoConfig, AutoTokenizer, - DataCollatorWithPadding, - DefaultDataCollator, EvalPrediction, HfArgumentParser, PreTrainedTokenizerFast, + PushToHubCallback, TFAutoModelForQuestionAnswering, TFTrainingArguments, + create_optimizer, set_seed, ) from transformers.utils import CONFIG_NAME, TF2_WEIGHTS_NAME, check_min_version, send_example_telemetry @@ -609,7 +610,12 @@ def compute_metrics(p: EvalPrediction): # endregion with training_args.strategy.scope(): - # region Load model + + dataset_options = tf.data.Options() + dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF + num_replicas = training_args.strategy.num_replicas_in_sync + + # region Load model and prepare datasets if checkpoint is None: model_path = model_args.model_name_or_path else: @@ -621,71 +627,163 @@ def compute_metrics(p: EvalPrediction): revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) - optimizer = tf.keras.optimizers.Adam( - learning_rate=training_args.learning_rate, - beta_1=training_args.adam_beta1, - beta_2=training_args.adam_beta2, - epsilon=training_args.adam_epsilon, - clipnorm=training_args.max_grad_norm, - ) + if training_args.do_train: - # no user-specified loss = will use the model internal loss - model.compile(optimizer=optimizer) - # endregion + training_dataset = model.prepare_tf_dataset( + processed_datasets["train"], + shuffle=True, + batch_size=training_args.per_device_train_batch_size * num_replicas, + tokenizer=tokenizer, + ) + + training_dataset = training_dataset.with_options(dataset_options) + + num_train_steps = len(training_dataset) * training_args.num_train_epochs + if training_args.warmup_steps > 0: + num_warmup_steps = training_args.warmup_steps + elif training_args.warmup_ratio > 0: + num_warmup_steps = int(num_train_steps * training_args.warmup_ratio) + else: + num_warmup_steps = 0 + + optimizer, schedule = create_optimizer( + init_lr=training_args.learning_rate, + 
num_train_steps=int(num_train_steps),
+                num_warmup_steps=num_warmup_steps,
+                adam_beta1=training_args.adam_beta1,
+                adam_beta2=training_args.adam_beta2,
+                adam_epsilon=training_args.adam_epsilon,
+                weight_decay_rate=training_args.weight_decay,
+                adam_global_clipnorm=training_args.max_grad_norm,
+            )
+
+            # no user-specified loss = will use the model internal loss
+            model.compile(optimizer=optimizer, jit_compile=training_args.xla, metrics=["accuracy"])

-    # region Training
-    if padding:
-        data_collator = DefaultDataCollator(return_tensors="tf")
         else:
-        data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
-    tensor_keys = ["attention_mask", "input_ids"]
-    label_keys = ["start_positions", "end_positions"]
+            model.compile(optimizer=None, jit_compile=training_args.xla, metrics=["accuracy"])
+            training_dataset = None

-    if training_args.do_train:
-        # Make a tf.data.Dataset for this
-        training_dataset = processed_datasets["train"].to_tf_dataset(
-            # labels are passed as input, as we will use the model's internal loss
-            columns=tensor_keys + label_keys,
-            shuffle=True,
-            batch_size=training_args.per_device_train_batch_size,
-            collate_fn=data_collator,
-            drop_remainder=True,
+        if training_args.do_eval:
+            eval_dataset = model.prepare_tf_dataset(
+                processed_datasets["validation"],
+                shuffle=False,
+                batch_size=training_args.per_device_eval_batch_size * num_replicas,
+                tokenizer=tokenizer,
+            )
+            eval_dataset = eval_dataset.with_options(dataset_options)
+        else:
+            eval_dataset = None
+
+        if training_args.do_predict:
+            predict_dataset = model.prepare_tf_dataset(
+                processed_datasets["test"],
+                shuffle=False,
+                batch_size=training_args.per_device_eval_batch_size * num_replicas,
+                tokenizer=tokenizer,
             )
-        model.fit(training_dataset, epochs=int(training_args.num_train_epochs))
+            predict_dataset = predict_dataset.with_options(dataset_options)
+        else:
+            predict_dataset = None
+
+        # endregion
+
+        # region Preparing push_to_hub and model card
+        push_to_hub_model_id = training_args.push_to_hub_model_id
+        model_name = model_args.model_name_or_path.split("/")[-1]
+        if not push_to_hub_model_id:
+            if data_args.dataset_name is not None:
+                push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}"
+            else:
+                push_to_hub_model_id = f"{model_name}-finetuned-question-answering"
+
+        model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "question-answering"}
+        if data_args.dataset_name is not None:
+            model_card_kwargs["dataset_tags"] = data_args.dataset_name
+            if data_args.dataset_config_name is not None:
+                model_card_kwargs["dataset_args"] = data_args.dataset_config_name
+                model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
+            else:
+                model_card_kwargs["dataset"] = data_args.dataset_name
+
+        if training_args.push_to_hub:
+            callbacks = [
+                PushToHubCallback(
+                    output_dir=training_args.output_dir,
+                    model_id=push_to_hub_model_id,
+                    organization=training_args.push_to_hub_organization,
+                    token=training_args.push_to_hub_token,
+                    tokenizer=tokenizer,
+                    **model_card_kwargs,
+                )
+            ]
+        else:
+            callbacks = []
         # endregion

-    # region Evaluation
+        # region Training and Evaluation
+
+        if training_args.do_train:
+            # Note that the validation and test datasets have been processed in a different way to the
+            # training datasets in this example, and so they don't have the same label structure.
+            # As such, we don't pass them directly to Keras, but instead get model predictions to evaluate
+            # after training.
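The warmup computation used with create_optimizer() above follows the same precedence in every script this patch touches: an explicit --warmup_steps wins, otherwise --warmup_ratio scales the total step count, otherwise there is no warmup. A hypothetical helper, shown only to make that precedence explicit:

def get_num_warmup_steps(num_train_steps, warmup_steps, warmup_ratio):
    # An explicit step count wins; otherwise fall back to a fraction of all steps.
    if warmup_steps > 0:
        return warmup_steps
    if warmup_ratio > 0:
        return int(num_train_steps * warmup_ratio)
    return 0

# e.g. num_warmup_steps = get_num_warmup_steps(
#     num_train_steps, training_args.warmup_steps, training_args.warmup_ratio
# )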
+ model.fit(training_dataset, epochs=int(training_args.num_train_epochs), callbacks=callbacks) + if training_args.do_eval: logger.info("*** Evaluation ***") - eval_inputs = { - "input_ids": tf.ragged.constant(processed_datasets["validation"]["input_ids"]).to_tensor(), - "attention_mask": tf.ragged.constant(processed_datasets["validation"]["attention_mask"]).to_tensor(), - } - eval_predictions = model.predict(eval_inputs) + + # In this example, we compute advanced metrics at the end of training, but + # if you'd like to compute metrics every epoch that are too complex to be written as + # standard Keras metrics, you can use our KerasMetricCallback. See + # https://huggingface.co/docs/transformers/main/en/main_classes/keras_callbacks + + eval_predictions = model.predict(eval_dataset) + if isinstance(eval_predictions.start_logits, tf.RaggedTensor): + # If predictions are RaggedTensor, we densify them. Since they are logits, padding with 0 is a bad idea! + # The reason is that a logit of 0 can often end up as quite a high probability value, sometimes even + # the highest probability in a sample. Instead, we use a large negative value, which ensures that the + # padding positions are correctly masked. + eval_start_logits = eval_predictions.start_logits.to_tensor(default_value=-1000).numpy() + eval_end_logits = eval_predictions.end_logits.to_tensor(default_value=-1000).numpy() + else: + eval_start_logits = eval_predictions.start_logits + eval_end_logits = eval_predictions.end_logits post_processed_eval = post_processing_function( datasets["validation"], processed_datasets["validation"], - (eval_predictions.start_logits, eval_predictions.end_logits), + (eval_start_logits, eval_end_logits), ) metrics = compute_metrics(post_processed_eval) logging.info("Evaluation metrics:") for metric, value in metrics.items(): logging.info(f"{metric}: {value:.3f}") + if training_args.output_dir is not None: + output_eval_file = os.path.join(training_args.output_dir, "all_results.json") + with open(output_eval_file, "w") as writer: + writer.write(json.dumps(metrics)) # endregion # region Prediction if training_args.do_predict: logger.info("*** Predict ***") - predict_inputs = { - "input_ids": tf.ragged.constant(processed_datasets["test"]["input_ids"]).to_tensor(), - "attention_mask": tf.ragged.constant(processed_datasets["test"]["attention_mask"]).to_tensor(), - } - test_predictions = model.predict(predict_inputs) + + test_predictions = model.predict(predict_dataset) + if isinstance(test_predictions.start_logits, tf.RaggedTensor): + # If predictions are RaggedTensor, we densify them. Since they are logits, padding with 0 is a bad idea! + # The reason is that a logit of 0 can often end up as quite a high probability value, sometimes even + # the highest probability in a sample. Instead, we use a large negative value, which ensures that the + # padding positions are correctly masked. 
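The comment above explains why ragged logits are densified with a large negative fill value rather than zero; the effect is easy to verify in isolation (the logit values here are invented for illustration):

import tensorflow as tf

# Two samples of different lengths give a RaggedTensor of logits.
ragged_logits = tf.ragged.constant([[2.1, -0.3, 0.7], [1.4, 0.9]])

# Filling with 0.0 would hand the padding position a logit comparable to the
# real ones; -1000.0 keeps it negligible under softmax and out of any argmax.
dense_logits = ragged_logits.to_tensor(default_value=-1000.0)
print(dense_logits.numpy())  # second row is padded to length 3 with -1000.0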
+ test_start_logits = test_predictions.start_logits.to_tensor(default_value=-1000).numpy() + test_end_logits = test_predictions.end_logits.to_tensor(default_value=-1000).numpy() + else: + test_start_logits = test_predictions.start_logits + test_end_logits = test_predictions.end_logits post_processed_test = post_processing_function( datasets["test"], processed_datasets["test"], - (test_predictions.start_logits, test_predictions.end_logits), + (test_start_logits, test_end_logits), ) metrics = compute_metrics(post_processed_test) @@ -694,8 +792,9 @@ def compute_metrics(p: EvalPrediction): logging.info(f"{metric}: {value:.3f}") # endregion - if training_args.push_to_hub: - model.push_to_hub() + if training_args.output_dir is not None and not training_args.push_to_hub: + # If we're not pushing to hub, at least save a local copy when we're done + model.save_pretrained(training_args.output_dir) if __name__ == "__main__": diff --git a/examples/tensorflow/summarization/run_summarization.py b/examples/tensorflow/summarization/run_summarization.py index 6d4cf99e6782f8..2cf6bdba604b8d 100644 --- a/examples/tensorflow/summarization/run_summarization.py +++ b/examples/tensorflow/summarization/run_summarization.py @@ -18,11 +18,11 @@ """ # You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments. +import json import logging import os import sys from dataclasses import dataclass, field -from functools import partial from typing import Optional import datasets @@ -30,7 +30,6 @@ import numpy as np import tensorflow as tf from datasets import load_dataset -from tqdm import tqdm import evaluate import transformers @@ -38,7 +37,10 @@ from transformers import ( AutoConfig, AutoTokenizer, + DataCollatorForSeq2Seq, HfArgumentParser, + KerasMetricCallback, + PushToHubCallback, TFAutoModelForSeq2SeqLM, TFTrainingArguments, create_optimizer, @@ -253,7 +255,6 @@ def __post_init__(self): # endregion - # region Dataset name mappings summarization_name_mapping = { "amazon_reviews_multi": ("review_body", "review_title"), @@ -272,71 +273,6 @@ def __post_init__(self): # endregion -# region Data generator -def sample_generator(dataset, model, tokenizer, shuffle, pad_to_multiple_of=None): - if shuffle: - sample_ordering = np.random.permutation(len(dataset)) - else: - sample_ordering = np.arange(len(dataset)) - for sample_idx in sample_ordering: - example = dataset[int(sample_idx)] - # Handle dicts with proper padding and conversion to tensor. 
- example = tokenizer.pad(example, return_tensors="np", pad_to_multiple_of=pad_to_multiple_of) - example = {key: tf.convert_to_tensor(arr, dtype_hint=tf.int32) for key, arr in example.items()} - if model is not None and hasattr(model, "prepare_decoder_input_ids_from_labels"): - decoder_input_ids = model.prepare_decoder_input_ids_from_labels( - labels=tf.expand_dims(example["labels"], 0) - ) - example["decoder_input_ids"] = tf.squeeze(decoder_input_ids, 0) - yield example, example["labels"] # TF needs some kind of labels, even if we don't use them - return - - -# endregion - - -# region Helper functions -def dataset_to_tf(dataset, model, tokenizer, total_batch_size, num_epochs, shuffle): - if dataset is None: - return None - train_generator = partial(sample_generator, dataset, model, tokenizer, shuffle=shuffle) - train_signature = { - feature: tf.TensorSpec(shape=(None,), dtype=tf.int32) - for feature in dataset.features - if feature != "special_tokens_mask" - } - if ( - model is not None - and "decoder_input_ids" not in train_signature - and hasattr(model, "prepare_decoder_input_ids_from_labels") - ): - train_signature["decoder_input_ids"] = train_signature["labels"] - # This may need to be changed depending on your particular model or tokenizer! - padding_values = { - key: tf.convert_to_tensor(tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0, dtype=tf.int32) - for key in train_signature.keys() - } - padding_values["labels"] = tf.convert_to_tensor(-100, dtype=tf.int32) - train_signature["labels"] = train_signature["input_ids"] - train_signature = (train_signature, train_signature["labels"]) - options = tf.data.Options() - options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF - tf_dataset = ( - tf.data.Dataset.from_generator(train_generator, output_signature=train_signature) - .with_options(options) - .padded_batch( - batch_size=total_batch_size, - drop_remainder=True, - padding_values=(padding_values, np.array(-100, dtype=np.int32)), - ) - .repeat(int(num_epochs)) - ) - return tf_dataset - - -# endregion - - def main(): # region Argument parsing # See all possible arguments in src/transformers/training_args.py @@ -587,59 +523,148 @@ def postprocess_text(preds, labels): if model.config.decoder_start_token_id is None: raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined") + label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id + data_collator = DataCollatorForSeq2Seq( + tokenizer, + model=model, + label_pad_token_id=label_pad_token_id, + pad_to_multiple_of=128, # Reduce the number of unique shapes for XLA, especially for generation + return_tensors="tf", + ) + + dataset_options = tf.data.Options() + dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF + num_replicas = training_args.strategy.num_replicas_in_sync total_train_batch_size = training_args.per_device_train_batch_size * num_replicas total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas - tf_train_dataset = dataset_to_tf( + + # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in + # training. This is the recommended way to use a Hugging Face dataset when training with Keras. 
You can also + # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names + # yourself if you use this method, whereas they are automatically inferred from the model input names when + # using model.prepare_tf_dataset() + # For more info see the docs: + # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset + # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset + + tf_train_dataset = model.prepare_tf_dataset( train_dataset, - model, - tokenizer, - total_batch_size=total_train_batch_size, - num_epochs=training_args.num_train_epochs, + collate_fn=data_collator, + batch_size=total_train_batch_size, shuffle=True, - ) - tf_eval_dataset = dataset_to_tf( + ).with_options(dataset_options) + tf_eval_dataset = model.prepare_tf_dataset( eval_dataset, - model, - tokenizer, - total_eval_batch_size, - num_epochs=1, + collate_fn=data_collator, + batch_size=total_eval_batch_size, shuffle=False, - ) + ).with_options(dataset_options) # endregion # region Optimizer, loss and LR scheduling - # Scheduler and math around the number of training steps. - num_update_steps_per_epoch = len(train_dataset) // total_train_batch_size - num_train_steps = training_args.num_train_epochs * num_update_steps_per_epoch - optimizer, lr_schedule = create_optimizer( - init_lr=training_args.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=0 - ) - - def masked_sparse_categorical_crossentropy(y_true, y_pred): - # We clip the negative labels to 0 to avoid NaNs appearing in the output and - # fouling up everything that comes afterwards. The loss values corresponding to clipped values - # will be masked later anyway, but even masked NaNs seem to cause overflows for some reason. - # 1e6 is chosen as a reasonable upper bound for the number of token indices - in the unlikely - # event that you have more than 1 million tokens in your vocabulary, consider increasing this value. - # More pragmatically, consider redesigning your tokenizer. 
- losses = tf.keras.losses.sparse_categorical_crossentropy( - tf.clip_by_value(y_true, 0, int(1e6)), y_pred, from_logits=True + num_train_steps = int(len(tf_train_dataset) * training_args.num_train_epochs) + if training_args.warmup_steps > 0: + num_warmup_steps = training_args.warmup_steps + elif training_args.warmup_ratio > 0: + num_warmup_steps = int(num_train_steps * training_args.warmup_ratio) + else: + num_warmup_steps = 0 + if training_args.do_train: + optimizer, lr_schedule = create_optimizer( + init_lr=training_args.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + adam_beta1=training_args.adam_beta1, + adam_beta2=training_args.adam_beta2, + adam_epsilon=training_args.adam_epsilon, + weight_decay_rate=training_args.weight_decay, + adam_global_clipnorm=training_args.max_grad_norm, ) - # Compute the per-sample loss only over the unmasked tokens - losses = tf.ragged.boolean_mask(losses, y_true != -100) - losses = tf.reduce_mean(losses, axis=-1) - return losses + else: + optimizer = None + + # endregion + # region Metric and KerasMetricCallback + if training_args.do_eval: + metric = evaluate.load("rouge") + + if data_args.val_max_target_length is None: + data_args.val_max_target_length = data_args.max_target_length + + gen_kwargs = { + "max_length": data_args.val_max_target_length if data_args is not None else config.max_length, + "num_beams": data_args.num_beams, + "no_repeat_ngram_size": 0, # Not supported under XLA right now, and some models set it by default + } + + def compute_metrics(preds): + predictions, labels = preds + if isinstance(predictions, tuple): + predictions = predictions[0] + decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True) + labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) + decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) + metrics = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True) + # Only print the mid f-measures, but there are a lot of other statistics in there too! + metrics = {key: round(val.mid.fmeasure * 100, 4) for key, val in metrics.items()} + return metrics + + # The KerasMetricCallback allows metrics that are too complex to write as standard Keras metrics + # to be computed each epoch. Any Python code can be included in the metric_fn. This is especially + # useful for metrics like BLEU and ROUGE that perform string comparisons on decoded model outputs. 
+ # For more information, see the docs at + # https://huggingface.co/docs/transformers/main_classes/keras_callbacks#transformers.KerasMetricCallback + + metric_callback = KerasMetricCallback( + metric_fn=compute_metrics, + eval_dataset=tf_eval_dataset, + predict_with_generate=True, + use_xla_generation=True, + generate_kwargs=gen_kwargs, + ) + callbacks = [metric_callback] + else: + callbacks = [] # endregion - # region Metric - metric = evaluate.load("rouge") + # region Preparing push_to_hub and model card + push_to_hub_model_id = training_args.push_to_hub_model_id + model_name = model_args.model_name_or_path.split("/")[-1] + if not push_to_hub_model_id: + if data_args.dataset_name is not None: + push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}" + else: + push_to_hub_model_id = f"{model_name}-finetuned-summarization" + + model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "summarization"} + if data_args.dataset_name is not None: + model_card_kwargs["dataset_tags"] = data_args.dataset_name + if data_args.dataset_config_name is not None: + model_card_kwargs["dataset_args"] = data_args.dataset_config_name + model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" + else: + model_card_kwargs["dataset"] = data_args.dataset_name + + if training_args.push_to_hub: + # Because this training can be quite long, we save once per epoch. + callbacks.append( + PushToHubCallback( + output_dir=training_args.output_dir, + model_id=push_to_hub_model_id, + organization=training_args.push_to_hub_organization, + token=training_args.push_to_hub_token, + tokenizer=tokenizer, + **model_card_kwargs, + ) + ) # endregion # region Training - model.compile(loss={"logits": masked_sparse_categorical_crossentropy}, optimizer=optimizer) - + model.compile(optimizer=optimizer, jit_compile=training_args.xla) + eval_metrics = None if training_args.do_train: logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") @@ -648,28 +673,29 @@ def masked_sparse_categorical_crossentropy(y_true, y_pred): logger.info(f" Total train batch size = {total_train_batch_size}") logger.info(f" Total optimization steps = {num_train_steps}") - model.fit( - tf_train_dataset, - epochs=int(training_args.num_train_epochs), - steps_per_epoch=num_update_steps_per_epoch, - ) + if training_args.xla and not data_args.pad_to_max_length: + logger.warning( + "XLA training may be slow at first when --pad_to_max_length is not set " + "until all possible shapes have been compiled." 
+ ) + history = model.fit(tf_train_dataset, epochs=int(training_args.num_train_epochs), callbacks=callbacks) + eval_metrics = {key: val[-1] for key, val in history.history.items()} # endregion # region Validation - if data_args.val_max_target_length is None: - data_args.val_max_target_length = data_args.max_target_length - gen_kwargs = { - "max_length": data_args.val_max_target_length if data_args is not None else config.max_length, - "num_beams": data_args.num_beams, - } - if training_args.do_eval: + if training_args.do_eval and not training_args.do_train: + # Do a standalone evaluation run logger.info("Evaluation...") - for batch, labels in tqdm( - tf_eval_dataset, total=len(eval_dataset) // training_args.per_device_eval_batch_size - ): + + # Compiling generation with XLA yields enormous speedups, see https://huggingface.co/blog/tf-xla-generate + @tf.function(jit_compile=True) + def generate(**kwargs): + return model.generate(**kwargs) + + for batch, labels in tf_eval_dataset: batch.update(gen_kwargs) - generated_tokens = model.generate(**batch) + generated_tokens = generate(**batch) if isinstance(generated_tokens, tuple): generated_tokens = generated_tokens[0] decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) @@ -679,13 +705,19 @@ def masked_sparse_categorical_crossentropy(y_true, y_pred): metric.add_batch(predictions=decoded_preds, references=decoded_labels) - result = metric.compute(use_stemmer=True) - result = {k: round(v * 100, 4) for k, v in result.items()} + eval_metrics = metric.compute(use_stemmer=True) + result = {key: round(val.mid.fmeasure * 100, 4) for key, val in eval_metrics.items()} logger.info(result) # endregion - if training_args.output_dir is not None: + if training_args.output_dir is not None and eval_metrics is not None: + output_eval_file = os.path.join(training_args.output_dir, "all_results.json") + with open(output_eval_file, "w") as writer: + writer.write(json.dumps(eval_metrics)) + + if training_args.output_dir is not None and not training_args.push_to_hub: + # If we're not pushing to hub, at least save a local copy when we're done model.save_pretrained(training_args.output_dir) diff --git a/examples/tensorflow/test_tensorflow_examples.py b/examples/tensorflow/test_tensorflow_examples.py new file mode 100644 index 00000000000000..9b692ce80cbdd6 --- /dev/null +++ b/examples/tensorflow/test_tensorflow_examples.py @@ -0,0 +1,295 @@ +# coding=utf-8 +# Copyright 2022 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
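The XLA-compiled generation used in the summarization evaluation above can be reproduced standalone; t5-small is an illustrative checkpoint, and the first call is slow because XLA traces and compiles the decoding loop, after which calls with the same input shape reuse the compiled graph (hence the fixed-length padding). See https://huggingface.co/blog/tf-xla-generate for details.

import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small")

@tf.function(jit_compile=True)
def generate(**kwargs):
    # XLA traces the whole generation loop once per input shape.
    return model.generate(**kwargs)

inputs = tokenizer(
    ["summarize: The quick brown fox jumped over the lazy dog."],
    padding="max_length",  # fixed shapes avoid recompilation on every new batch
    max_length=128,
    return_tensors="tf",
)
tokens = generate(**inputs, max_length=32)
print(tokenizer.batch_decode(tokens, skip_special_tokens=True))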
+ + +import argparse +import json +import logging +import os +import sys +from unittest import skip +from unittest.mock import patch + +import tensorflow as tf + +from transformers.testing_utils import TestCasePlus, get_gpu_count, slow + + +SRC_DIRS = [ + os.path.join(os.path.dirname(__file__), dirname) + for dirname in [ + "text-generation", + "text-classification", + "token-classification", + "language-modeling", + "multiple-choice", + "question-answering", + "summarization", + "translation", + ] +] +sys.path.extend(SRC_DIRS) + + +if SRC_DIRS is not None: + import run_clm + import run_mlm + import run_ner + import run_qa as run_squad + import run_summarization + import run_swag + import run_text_classification + import run_translation + + +logging.basicConfig(level=logging.DEBUG) + +logger = logging.getLogger() + + +def get_setup_file(): + parser = argparse.ArgumentParser() + parser.add_argument("-f") + args = parser.parse_args() + return args.f + + +def get_results(output_dir): + results = {} + path = os.path.join(output_dir, "all_results.json") + if os.path.exists(path): + with open(path, "r") as f: + results = json.load(f) + else: + raise ValueError(f"can't find {path}") + return results + + +def is_cuda_available(): + return bool(tf.config.list_physical_devices("GPU")) + + +stream_handler = logging.StreamHandler(sys.stdout) +logger.addHandler(stream_handler) + + +class ExamplesTests(TestCasePlus): + @skip("Skipping until shape inference for to_tf_dataset PR is merged.") + def test_run_text_classification(self): + tmp_dir = self.get_auto_remove_tmp_dir() + testargs = f""" + run_text_classification.py + --model_name_or_path distilbert-base-uncased + --output_dir {tmp_dir} + --overwrite_output_dir + --train_file ./tests/fixtures/tests_samples/MRPC/train.csv + --validation_file ./tests/fixtures/tests_samples/MRPC/dev.csv + --do_train + --do_eval + --per_device_train_batch_size=2 + --per_device_eval_batch_size=1 + --learning_rate=1e-4 + --max_steps=10 + --warmup_steps=2 + --seed=42 + --max_seq_length=128 + """.split() + + if is_cuda_available(): + testargs.append("--fp16") + + with patch.object(sys, "argv", testargs): + run_text_classification.main() + # Reset the mixed precision policy so we don't break other tests + tf.keras.mixed_precision.set_global_policy("float32") + result = get_results(tmp_dir) + self.assertGreaterEqual(result["eval_accuracy"], 0.75) + + def test_run_clm(self): + tmp_dir = self.get_auto_remove_tmp_dir() + testargs = f""" + run_clm.py + --model_name_or_path distilgpt2 + --train_file ./tests/fixtures/sample_text.txt + --validation_file ./tests/fixtures/sample_text.txt + --do_train + --do_eval + --block_size 128 + --per_device_train_batch_size 2 + --per_device_eval_batch_size 1 + --num_train_epochs 2 + --output_dir {tmp_dir} + --overwrite_output_dir + """.split() + + if len(tf.config.list_physical_devices("GPU")) > 1: + # Skipping because there are not enough batches to train the model + would need a drop_last to work. 
+ return + + with patch.object(sys, "argv", testargs): + run_clm.main() + result = get_results(tmp_dir) + self.assertLess(result["eval_perplexity"], 100) + + def test_run_mlm(self): + tmp_dir = self.get_auto_remove_tmp_dir() + testargs = f""" + run_mlm.py + --model_name_or_path distilroberta-base + --train_file ./tests/fixtures/sample_text.txt + --validation_file ./tests/fixtures/sample_text.txt + --max_seq_length 64 + --output_dir {tmp_dir} + --overwrite_output_dir + --do_train + --do_eval + --prediction_loss_only + --num_train_epochs=1 + """.split() + + with patch.object(sys, "argv", testargs): + run_mlm.main() + result = get_results(tmp_dir) + self.assertLess(result["eval_perplexity"], 42) + + def test_run_ner(self): + # with so little data distributed training needs more epochs to get the score on par with 0/1 gpu + epochs = 7 if get_gpu_count() > 1 else 2 + + tmp_dir = self.get_auto_remove_tmp_dir() + testargs = f""" + run_ner.py + --model_name_or_path bert-base-uncased + --train_file tests/fixtures/tests_samples/conll/sample.json + --validation_file tests/fixtures/tests_samples/conll/sample.json + --output_dir {tmp_dir} + --overwrite_output_dir + --do_train + --do_eval + --warmup_steps=2 + --learning_rate=2e-4 + --per_device_train_batch_size=2 + --per_device_eval_batch_size=2 + --num_train_epochs={epochs} + --seed 7 + """.split() + + with patch.object(sys, "argv", testargs): + run_ner.main() + result = get_results(tmp_dir) + self.assertGreaterEqual(result["accuracy"], 0.75) + + def test_run_squad(self): + tmp_dir = self.get_auto_remove_tmp_dir() + testargs = f""" + run_qa.py + --model_name_or_path bert-base-uncased + --version_2_with_negative + --train_file tests/fixtures/tests_samples/SQUAD/sample.json + --validation_file tests/fixtures/tests_samples/SQUAD/sample.json + --output_dir {tmp_dir} + --overwrite_output_dir + --max_steps=10 + --warmup_steps=2 + --do_train + --do_eval + --learning_rate=2e-4 + --per_device_train_batch_size=2 + --per_device_eval_batch_size=1 + """.split() + + with patch.object(sys, "argv", testargs): + run_squad.main() + result = get_results(tmp_dir) + self.assertGreaterEqual(result["f1"], 30) + self.assertGreaterEqual(result["exact"], 30) + + def test_run_swag(self): + tmp_dir = self.get_auto_remove_tmp_dir() + testargs = f""" + run_swag.py + --model_name_or_path bert-base-uncased + --train_file tests/fixtures/tests_samples/swag/sample.json + --validation_file tests/fixtures/tests_samples/swag/sample.json + --output_dir {tmp_dir} + --overwrite_output_dir + --max_steps=20 + --warmup_steps=2 + --do_train + --do_eval + --learning_rate=2e-4 + --per_device_train_batch_size=2 + --per_device_eval_batch_size=1 + """.split() + + with patch.object(sys, "argv", testargs): + run_swag.main() + result = get_results(tmp_dir) + self.assertGreaterEqual(result["val_accuracy"], 0.8) + + @slow + def test_run_summarization(self): + tmp_dir = self.get_auto_remove_tmp_dir() + testargs = f""" + run_summarization.py + --model_name_or_path t5-small + --train_file tests/fixtures/tests_samples/xsum/sample.json + --validation_file tests/fixtures/tests_samples/xsum/sample.json + --output_dir {tmp_dir} + --overwrite_output_dir + --max_steps=50 + --warmup_steps=8 + --do_train + --do_eval + --learning_rate=2e-4 + --per_device_train_batch_size=2 + --per_device_eval_batch_size=1 + """.split() + + with patch.object(sys, "argv", testargs): + run_summarization.main() + result = get_results(tmp_dir) + self.assertGreaterEqual(result["rouge1"], 10) + self.assertGreaterEqual(result["rouge2"], 2) + 
self.assertGreaterEqual(result["rougeL"], 7) + self.assertGreaterEqual(result["rougeLsum"], 7) + + @slow + def test_run_translation(self): + tmp_dir = self.get_auto_remove_tmp_dir() + testargs = f""" + run_translation.py + --model_name_or_path Rocketknight1/student_marian_en_ro_6_1 + --source_lang en + --target_lang ro + --train_file tests/fixtures/tests_samples/wmt16/sample.json + --validation_file tests/fixtures/tests_samples/wmt16/sample.json + --output_dir {tmp_dir} + --overwrite_output_dir + --warmup_steps=8 + --do_train + --do_eval + --learning_rate=3e-3 + --num_train_epochs 12 + --per_device_train_batch_size=2 + --per_device_eval_batch_size=1 + --source_lang en_XX + --target_lang ro_RO + """.split() + + with patch.object(sys, "argv", testargs): + run_translation.main() + result = get_results(tmp_dir) + self.assertGreaterEqual(result["bleu"], 30) diff --git a/examples/tensorflow/text-classification/run_glue.py b/examples/tensorflow/text-classification/run_glue.py index 9fb0b3f8e43482..d5a6b096b3467e 100644 --- a/examples/tensorflow/text-classification/run_glue.py +++ b/examples/tensorflow/text-classification/run_glue.py @@ -16,6 +16,7 @@ """ Finetuning the library models for sequence classification on GLUE.""" # You can also adapt this script on your own text classification task. Pointers for this are left as comments. +import json import logging import os import sys @@ -35,32 +36,16 @@ DefaultDataCollator, HfArgumentParser, PretrainedConfig, + PushToHubCallback, TFAutoModelForSequenceClassification, TFTrainingArguments, + create_optimizer, set_seed, ) from transformers.trainer_utils import get_last_checkpoint, is_main_process from transformers.utils import check_min_version, send_example_telemetry -# region Helper functions - - -class SavePretrainedCallback(tf.keras.callbacks.Callback): - # Hugging Face models have a save_pretrained() method that saves both the weights and the necessary - # metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback - # that saves the model with this method after each epoch. - def __init__(self, output_dir, **kwargs): - super().__init__() - self.output_dir = output_dir - - def on_epoch_end(self, epoch, logs=None): - self.model.save_pretrained(self.output_dir) - - -# endregion - - # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
check_min_version("4.22.0.dev0") @@ -312,7 +297,6 @@ def main(): # region Dataset preprocessing sentence1_key, sentence2_key = task_to_keys[data_args.task_name] - non_label_column_names = [name for name in datasets["train"].column_names if name != "label"] # Padding strategy if data_args.pad_to_max_length: @@ -394,24 +378,11 @@ def compute_metrics(preds, label_ids): ) # endregion - # region Optimizer, loss and compilation - optimizer = tf.keras.optimizers.Adam( - learning_rate=training_args.learning_rate, - beta_1=training_args.adam_beta1, - beta_2=training_args.adam_beta2, - epsilon=training_args.adam_epsilon, - clipnorm=training_args.max_grad_norm, - ) - if is_regression: - loss_fn = tf.keras.losses.MeanSquaredError() - metrics = [] - else: - loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) - metrics = ["accuracy"] - model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics) - # endregion - # region Convert data to a tf.data.Dataset + dataset_options = tf.data.Options() + dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF + num_replicas = training_args.strategy.num_replicas_in_sync + tf_data = dict() max_samples = { "train": data_args.max_train_samples, @@ -428,31 +399,89 @@ def compute_metrics(preds, label_ids): assert "label" in datasets[key].features, f"Missing labels from {key} data!" if key == "train": shuffle = True - batch_size = training_args.per_device_train_batch_size - drop_remainder = True # Saves us worrying about scaling gradients for the last batch + batch_size = training_args.per_device_train_batch_size * num_replicas else: shuffle = False - batch_size = training_args.per_device_eval_batch_size - drop_remainder = False + batch_size = training_args.per_device_eval_batch_size * num_replicas samples_limit = max_samples[key] dataset = datasets[key] if samples_limit is not None: dataset = dataset.select(range(samples_limit)) - data = dataset.to_tf_dataset( - columns=[col for col in dataset.column_names if col not in set(non_label_column_names + ["label"])], + + # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in + # training. This is the recommended way to use a Hugging Face dataset when training with Keras. 
You can also + # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names + # yourself if you use this method, whereas they are automatically inferred from the model input names when + # using model.prepare_tf_dataset() + # For more info see the docs: + # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset + # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset + data = model.prepare_tf_dataset( + dataset, shuffle=shuffle, batch_size=batch_size, collate_fn=data_collator, - drop_remainder=drop_remainder, - # `label_cols` is needed for user-defined losses, such as in this example - label_cols="label" if "label" in dataset.column_names else None, + tokenizer=tokenizer, ) + data = data.with_options(dataset_options) tf_data[key] = data # endregion + # region Optimizer, loss and compilation + if training_args.do_train: + num_train_steps = len(tf_data["train"]) * training_args.num_train_epochs + if training_args.warmup_steps > 0: + num_warmup_steps = training_args.warmup_steps + elif training_args.warmup_ratio > 0: + num_warmup_steps = int(num_train_steps * training_args.warmup_ratio) + else: + num_warmup_steps = 0 + + optimizer, schedule = create_optimizer( + init_lr=training_args.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + adam_beta1=training_args.adam_beta1, + adam_beta2=training_args.adam_beta2, + adam_epsilon=training_args.adam_epsilon, + weight_decay_rate=training_args.weight_decay, + adam_global_clipnorm=training_args.max_grad_norm, + ) + else: + optimizer = "adam" # Just write anything because we won't be using it + if is_regression: + metrics = [] + else: + metrics = ["accuracy"] + model.compile(optimizer=optimizer, metrics=metrics, jit_compile=training_args.xla) + # endregion + + # region Preparing push_to_hub and model card + push_to_hub_model_id = training_args.push_to_hub_model_id + model_name = model_args.model_name_or_path.split("/")[-1] + if not push_to_hub_model_id: + push_to_hub_model_id = f"{model_name}-finetuned-glue" + + model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-classification"} + model_card_kwargs["task_name"] = data_args.task_name + + if training_args.push_to_hub: + callbacks = [ + PushToHubCallback( + output_dir=training_args.output_dir, + model_id=push_to_hub_model_id, + organization=training_args.push_to_hub_organization, + token=training_args.push_to_hub_token, + tokenizer=tokenizer, + **model_card_kwargs, + ) + ] + else: + callbacks = [] + # endregion + # region Training and validation if training_args.do_train: - callbacks = [SavePretrainedCallback(output_dir=training_args.output_dir)] if training_args.do_eval and not data_args.task_name == "mnli": # Do both evaluation and training in the Keras fit loop, unless the task is MNLI # because MNLI has two validation sets @@ -472,6 +501,12 @@ def compute_metrics(preds, label_ids): # We normally do validation as part of the Keras fit loop, but we run it independently # if there was no fit() step (because we didn't train the model) or if the task is MNLI, # because MNLI has a separate validation-mismatched validation set + + # In this example, we compute advanced metrics only at the end of training, and only compute + # loss and accuracy on the validation set each epoch, but + # if you'd like to compute metrics every epoch that are too complex to be written as + # standard 
Keras metrics, you can use our KerasMetricCallback. See + # https://huggingface.co/docs/transformers/main/en/main_classes/keras_callbacks logger.info("*** Evaluate ***") # Loop to handle MNLI double evaluation (matched, mis-matched) @@ -489,6 +524,10 @@ def compute_metrics(preds, label_ids): eval_metrics = compute_metrics(eval_predictions, raw_dataset["label"]) print(f"Evaluation metrics ({task}):") print(eval_metrics) + if training_args.output_dir is not None: + output_eval_file = os.path.join(training_args.output_dir, "all_results.json") + with open(output_eval_file, "w") as writer: + writer.write(json.dumps(eval_metrics)) # endregion @@ -538,6 +577,10 @@ def compute_metrics(preds, label_ids): writer.write(f"{index}\t{item}\n") # endregion + if training_args.output_dir is not None and not training_args.push_to_hub: + # If we're not pushing to hub, at least save a local copy when we're done + model.save_pretrained(training_args.output_dir) + if __name__ == "__main__": main() diff --git a/examples/tensorflow/text-classification/run_text_classification.py b/examples/tensorflow/text-classification/run_text_classification.py index b5d19032971c5b..0cf1972e937fb8 100644 --- a/examples/tensorflow/text-classification/run_text_classification.py +++ b/examples/tensorflow/text-classification/run_text_classification.py @@ -16,6 +16,7 @@ """ Fine-tuning the library models for sequence classification.""" # You can also adapt this script on your own text classification task. Pointers for this are left as comments. +import json import logging import os import sys @@ -29,12 +30,12 @@ from transformers import ( AutoConfig, AutoTokenizer, - DataCollatorWithPadding, - DefaultDataCollator, HfArgumentParser, PretrainedConfig, + PushToHubCallback, TFAutoModelForSequenceClassification, TFTrainingArguments, + create_optimizer, set_seed, ) from transformers.utils import CONFIG_NAME, TF2_WEIGHTS_NAME, send_example_telemetry @@ -383,10 +384,6 @@ def preprocess_function(examples): datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache) - if data_args.pad_to_max_length: - data_collator = DefaultDataCollator(return_tensors="tf") - else: - data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf") # endregion with training_args.strategy.scope(): @@ -409,24 +406,10 @@ def preprocess_function(examples): ) # endregion - # region Optimizer, loss and compilation - optimizer = tf.keras.optimizers.Adam( - learning_rate=training_args.learning_rate, - beta_1=training_args.adam_beta1, - beta_2=training_args.adam_beta2, - epsilon=training_args.adam_epsilon, - clipnorm=training_args.max_grad_norm, - ) - if is_regression: - loss_fn = tf.keras.losses.MeanSquaredError() - metrics = [] - else: - loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) - metrics = ["accuracy"] - model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics) - # endregion - # region Convert data to a tf.data.Dataset + dataset_options = tf.data.Options() + dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF + num_replicas = training_args.strategy.num_replicas_in_sync tf_data = dict() max_samples = { @@ -438,50 +421,121 @@ def preprocess_function(examples): if key not in datasets: tf_data[key] = None continue + if ( + (key == "train" and not training_args.do_train) + or (key == "validation" and not training_args.do_eval) + or (key == "test" and not training_args.do_predict) + ): + tf_data[key] = None + continue if key in 
("train", "validation"): assert "label" in datasets[key].features, f"Missing labels from {key} data!" if key == "train": shuffle = True - batch_size = training_args.per_device_train_batch_size - drop_remainder = True # Saves us worrying about scaling gradients for the last batch + batch_size = training_args.per_device_train_batch_size * num_replicas else: shuffle = False - batch_size = training_args.per_device_eval_batch_size - drop_remainder = False + batch_size = training_args.per_device_eval_batch_size * num_replicas samples_limit = max_samples[key] dataset = datasets[key] if samples_limit is not None: dataset = dataset.select(range(samples_limit)) - data = dataset.to_tf_dataset( - columns=[col for col in dataset.column_names if col not in set(non_label_column_names + ["label"])], + + # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in + # training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also + # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names + # yourself if you use this method, whereas they are automatically inferred from the model input names when + # using model.prepare_tf_dataset() + # For more info see the docs: + # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset + # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset + + data = model.prepare_tf_dataset( + dataset, shuffle=shuffle, batch_size=batch_size, - collate_fn=data_collator, - drop_remainder=drop_remainder, - # `label_cols` is needed for user-defined losses, such as in this example - label_cols="label" if "label" in dataset.column_names else None, + tokenizer=tokenizer, ) + data = data.with_options(dataset_options) tf_data[key] = data # endregion + # region Optimizer, loss and compilation + + if training_args.do_train: + num_train_steps = len(tf_data["train"]) * training_args.num_train_epochs + if training_args.warmup_steps > 0: + num_warmup_steps = training_args.warmup_steps + elif training_args.warmup_ratio > 0: + num_warmup_steps = int(num_train_steps * training_args.warmup_ratio) + else: + num_warmup_steps = 0 + + optimizer, schedule = create_optimizer( + init_lr=training_args.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + adam_beta1=training_args.adam_beta1, + adam_beta2=training_args.adam_beta2, + adam_epsilon=training_args.adam_epsilon, + weight_decay_rate=training_args.weight_decay, + adam_global_clipnorm=training_args.max_grad_norm, + ) + else: + optimizer = None + if is_regression: + metrics = [] + else: + metrics = ["accuracy"] + model.compile(optimizer=optimizer, metrics=metrics) + # endregion + + # region Preparing push_to_hub and model card + push_to_hub_model_id = training_args.push_to_hub_model_id + model_name = model_args.model_name_or_path.split("/")[-1] + if not push_to_hub_model_id: + push_to_hub_model_id = f"{model_name}-finetuned-text-classification" + + model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-classification"} + + if training_args.push_to_hub: + callbacks = [ + PushToHubCallback( + output_dir=training_args.output_dir, + model_id=push_to_hub_model_id, + organization=training_args.push_to_hub_organization, + token=training_args.push_to_hub_token, + tokenizer=tokenizer, + **model_card_kwargs, + ) + ] + else: + callbacks = [] + # 
endregion + # region Training and validation if tf_data["train"] is not None: - callbacks = [SavePretrainedCallback(output_dir=training_args.output_dir)] model.fit( tf_data["train"], validation_data=tf_data["validation"], epochs=int(training_args.num_train_epochs), callbacks=callbacks, ) - elif tf_data["validation"] is not None: - # If there's a validation dataset but no training set, just evaluate the metrics + if tf_data["validation"] is not None: logger.info("Computing metrics on validation data...") if is_regression: loss = model.evaluate(tf_data["validation"]) - logger.info(f"Loss: {loss:.5f}") + logger.info(f"Eval loss: {loss:.5f}") else: loss, accuracy = model.evaluate(tf_data["validation"]) - logger.info(f"Loss: {loss:.5f}, Accuracy: {accuracy * 100:.4f}%") + logger.info(f"Eval loss: {loss:.5f}, Eval accuracy: {accuracy * 100:.4f}%") + if training_args.output_dir is not None: + output_eval_file = os.path.join(training_args.output_dir, "all_results.json") + eval_dict = {"eval_loss": loss} + if not is_regression: + eval_dict["eval_accuracy"] = accuracy + with open(output_eval_file, "w") as writer: + writer.write(json.dumps(eval_dict)) # endregion # region Prediction @@ -501,14 +555,9 @@ def preprocess_function(examples): logger.info(f"Wrote predictions to {output_test_file}!") # endregion - # region Prediction losses - # This section is outside the scope() because it's very quick to compute, but behaves badly inside it - if "test" in datasets and "label" in datasets["test"].features: - print("Computing prediction loss on test labels...") - labels = datasets["test"]["label"] - loss = float(loss_fn(labels, predictions).numpy()) - print(f"Test loss: {loss:.4f}") - # endregion + if training_args.output_dir is not None and not training_args.push_to_hub: + # If we're not pushing to hub, at least save a local copy when we're done + model.save_pretrained(training_args.output_dir) if __name__ == "__main__": diff --git a/examples/tensorflow/token-classification/run_ner.py b/examples/tensorflow/token-classification/run_ner.py index caa47e115a4bfa..8eb9aef92b7bd6 100644 --- a/examples/tensorflow/token-classification/run_ner.py +++ b/examples/tensorflow/token-classification/run_ner.py @@ -18,14 +18,14 @@ without using a Trainer. 
""" +import json import logging +import os import random from dataclasses import dataclass, field -from functools import partial from typing import Optional import datasets -import numpy as np import tensorflow as tf from datasets import ClassLabel, load_dataset @@ -33,10 +33,11 @@ import transformers from transformers import ( CONFIG_MAPPING, - MODEL_MAPPING, AutoConfig, AutoTokenizer, + DataCollatorForTokenClassification, HfArgumentParser, + PushToHubCallback, TFAutoModelForTokenClassification, TFTrainingArguments, create_optimizer, @@ -48,11 +49,7 @@ logger = logging.getLogger(__name__) logger.addHandler(logging.StreamHandler()) -require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") - -# You should update this to your particular problem to have better documentation of `model_type` -MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys()) -MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) +require_version("datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/token-classification/requirements.txt") # region Command-line arguments @@ -195,61 +192,6 @@ def __post_init__(self): # endregion -# region Data generator -def sample_generator(dataset, tokenizer, shuffle, pad_to_multiple_of=None): - # Trim off the last partial batch if present - if shuffle: - sample_ordering = np.random.permutation(len(dataset)) - else: - sample_ordering = np.arange(len(dataset)) - for sample_idx in sample_ordering: - example = dataset[int(sample_idx)] - # Handle dicts with proper padding and conversion to tensor. - example = tokenizer.pad(example, return_tensors="np", pad_to_multiple_of=pad_to_multiple_of) - if tokenizer.pad_token_id is not None: - example["labels"][example["attention_mask"] == 0] = -100 - example = {key: tf.convert_to_tensor(arr) for key, arr in example.items()} - - yield example, example["labels"] # TF needs some kind of labels, even if we don't use them - return - - -# endregion - - -# region Helper functions -def dataset_to_tf(dataset, tokenizer, total_batch_size, num_epochs, shuffle): - train_generator = partial(sample_generator, dataset, tokenizer, shuffle=shuffle) - train_signature = { - feature: tf.TensorSpec(shape=(None,), dtype=tf.int64) - for feature in dataset.features - if feature != "special_tokens_mask" - } - # This may need to be changed depending on your particular model or tokenizer! 
- padding_values = {key: tf.convert_to_tensor(0, dtype=tf.int64) for key in dataset.features} - padding_values["labels"] = tf.convert_to_tensor(-100, dtype=tf.int64) - if tokenizer.pad_token_id is not None: - padding_values["input_ids"] = tf.convert_to_tensor(tokenizer.pad_token_id, dtype=tf.int64) - train_signature["labels"] = train_signature["input_ids"] - train_signature = (train_signature, train_signature["labels"]) - options = tf.data.Options() - options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF - tf_dataset = ( - tf.data.Dataset.from_generator(train_generator, output_signature=train_signature) - .with_options(options) - .padded_batch( - batch_size=total_batch_size, - drop_remainder=True, - padding_values=(padding_values, np.array(0, dtype=np.int64)), - ) - .repeat(int(num_epochs)) - ) - return tf_dataset - - -# endregion - - def main(): # region Argument Parsing parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments)) @@ -419,6 +361,14 @@ def tokenize_and_align_labels(examples): train_dataset = processed_raw_datasets["train"] eval_dataset = processed_raw_datasets["validation"] + if data_args.max_train_samples is not None: + max_train_samples = min(len(train_dataset), data_args.max_train_samples) + train_dataset = train_dataset.select(range(max_train_samples)) + + if data_args.max_eval_samples is not None: + max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) + eval_dataset = eval_dataset.select(range(max_eval_samples)) + # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 3): logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") @@ -439,43 +389,62 @@ def tokenize_and_align_labels(examples): # endregion # region Create TF datasets + + # We need the DataCollatorForTokenClassification here, as we need to correctly pad labels as + # well as inputs. + collate_fn = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf") num_replicas = training_args.strategy.num_replicas_in_sync total_train_batch_size = training_args.per_device_train_batch_size * num_replicas - train_batches_per_epoch = len(train_dataset) // total_train_batch_size - tf_train_dataset = dataset_to_tf( + + dataset_options = tf.data.Options() + dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF + + # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in + # training. This is the recommended way to use a Hugging Face dataset when training with Keras. 
You can also + # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names + # yourself if you use this method, whereas they are automatically inferred from the model input names when + # using model.prepare_tf_dataset() + # For more info see the docs: + # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset + # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset + + tf_train_dataset = model.prepare_tf_dataset( train_dataset, - tokenizer, - total_batch_size=total_train_batch_size, - num_epochs=training_args.num_train_epochs, + collate_fn=collate_fn, + batch_size=total_train_batch_size, shuffle=True, - ) + ).with_options(dataset_options) total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas - eval_batches_per_epoch = len(eval_dataset) // total_eval_batch_size - tf_eval_dataset = dataset_to_tf( + tf_eval_dataset = model.prepare_tf_dataset( eval_dataset, - tokenizer, - total_batch_size=total_eval_batch_size, - num_epochs=training_args.num_train_epochs, + collate_fn=collate_fn, + batch_size=total_eval_batch_size, shuffle=False, - ) + ).with_options(dataset_options) # endregion # region Optimizer, loss and compilation + num_train_steps = int(len(tf_train_dataset) * training_args.num_train_epochs) + if training_args.warmup_steps > 0: + num_warmup_steps = training_args.warmup_steps + elif training_args.warmup_ratio > 0: + num_warmup_steps = int(num_train_steps * training_args.warmup_ratio) + else: + num_warmup_steps = 0 + optimizer, lr_schedule = create_optimizer( init_lr=training_args.learning_rate, - num_train_steps=int(training_args.num_train_epochs * train_batches_per_epoch), - num_warmup_steps=training_args.warmup_steps, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, adam_beta1=training_args.adam_beta1, adam_beta2=training_args.adam_beta2, adam_epsilon=training_args.adam_epsilon, weight_decay_rate=training_args.weight_decay, + adam_global_clipnorm=training_args.max_grad_norm, ) - def dummy_loss(y_true, y_pred): - return tf.reduce_mean(y_pred) - - model.compile(loss={"loss": dummy_loss}, optimizer=optimizer) + model.compile(optimizer=optimizer, jit_compile=training_args.xla) # endregion # Metrics @@ -517,6 +486,39 @@ def compute_metrics(): # endregion + # region Preparing push_to_hub and model card + push_to_hub_model_id = training_args.push_to_hub_model_id + model_name = model_args.model_name_or_path.split("/")[-1] + if not push_to_hub_model_id: + if data_args.dataset_name is not None: + push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}" + else: + push_to_hub_model_id = f"{model_name}-finetuned-token-classification" + + model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "token-classification"} + if data_args.dataset_name is not None: + model_card_kwargs["dataset_tags"] = data_args.dataset_name + if data_args.dataset_config_name is not None: + model_card_kwargs["dataset_args"] = data_args.dataset_config_name + model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" + else: + model_card_kwargs["dataset"] = data_args.dataset_name + + if training_args.push_to_hub: + callbacks = [ + PushToHubCallback( + output_dir=training_args.output_dir, + model_id=push_to_hub_model_id, + organization=training_args.push_to_hub_organization, + token=training_args.push_to_hub_token, + tokenizer=tokenizer, + 
**model_card_kwargs, + ) + ] + else: + callbacks = [] + # endregion + # region Training logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") @@ -524,23 +526,43 @@ def compute_metrics(): logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}") logger.info(f" Total train batch size = {total_train_batch_size}") # Only show the progress bar once on each machine. + model.fit( tf_train_dataset, validation_data=tf_eval_dataset, epochs=int(training_args.num_train_epochs), - steps_per_epoch=train_batches_per_epoch, - validation_steps=eval_batches_per_epoch, + callbacks=callbacks, ) # endregion # region Predictions - # For predictions, we preload the entire validation set - note that if you have a really giant validation - # set, you might need to change this! - eval_inputs = {key: tf.ragged.constant(eval_dataset[key]).to_tensor() for key in eval_dataset.features} - predictions = model.predict(eval_inputs, batch_size=training_args.per_device_eval_batch_size)["logits"] - predictions = tf.math.argmax(predictions, axis=-1) - labels = np.array(eval_inputs["labels"]) - labels[np.array(eval_inputs["attention_mask"]) == 0] = -100 + # If you have variable batch sizes (i.e. not using pad_to_max_length), then + # this bit might fail on TF < 2.8 because TF can't concatenate outputs of varying seq + # length from predict(). + + try: + predictions = model.predict(tf_eval_dataset, batch_size=training_args.per_device_eval_batch_size)["logits"] + except tf.python.framework.errors_impl.InvalidArgumentError: + raise ValueError( + "Concatenating predictions failed! If your version of TensorFlow is 2.8.0 or older " + "then you will need to use --pad_to_max_length to generate predictions, as older " + "versions of TensorFlow cannot concatenate variable-length predictions as RaggedTensor." + ) + if isinstance(predictions, tf.RaggedTensor): + predictions = predictions.to_tensor(default_value=-100) + predictions = tf.math.argmax(predictions, axis=-1).numpy() + if "label" in eval_dataset: + labels = eval_dataset.with_format("tf")["label"] + else: + labels = eval_dataset.with_format("tf")["labels"] + if isinstance(labels, tf.RaggedTensor): + labels = labels.to_tensor(default_value=-100) + labels = labels.numpy() + attention_mask = eval_dataset.with_format("tf")["attention_mask"] + if isinstance(attention_mask, tf.RaggedTensor): + attention_mask = attention_mask.to_tensor(default_value=-100) + attention_mask = attention_mask.numpy() + labels[attention_mask == 0] = -100 preds, refs = get_labels(predictions, labels) metric.add_batch( predictions=preds, @@ -550,12 +572,15 @@ def compute_metrics(): logger.info("Evaluation metrics:") for key, val in eval_metric.items(): logger.info(f"{key}: {val:.4f}") - # endregion - # We don't do predictions in the strategy scope because there are some issues in there right now. - # They'll get fixed eventually, promise! 
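Since the alignment convention above is easy to get wrong, here is a minimal, self-contained sketch of what it relies on: every position whose label is -100 (padding, or tokens zeroed out via the attention mask) is dropped before metrics are computed. The helper name and the `id2label` values are illustrative, not the script's exact code:

import numpy as np

def align_predictions(predictions, labels, id2label):
    # Drop every position whose label is -100 so predictions and references
    # stay the same length before being handed to a seqeval-style metric.
    preds, refs = [], []
    for pred_row, label_row in zip(predictions, labels):
        preds.append([id2label[int(p)] for p, l in zip(pred_row, label_row) if l != -100])
        refs.append([id2label[int(l)] for p, l in zip(pred_row, label_row) if l != -100])
    return preds, refs

id2label = {0: "O", 1: "B-PER"}
predictions = np.array([[0, 1, 0, 0]])
labels = np.array([[-100, 1, 0, -100]])
print(align_predictions(predictions, labels, id2label))  # ([['B-PER', 'O']], [['B-PER', 'O']])
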
+    if training_args.output_dir is not None:
+        output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
+        with open(output_eval_file, "w") as writer:
+            writer.write(json.dumps(eval_metric))
+    # endregion

-    if training_args.output_dir is not None:
+    if training_args.output_dir is not None and not training_args.push_to_hub:
+        # If we're not pushing to hub, at least save a local copy when we're done
         model.save_pretrained(training_args.output_dir)

diff --git a/examples/tensorflow/translation/run_translation.py b/examples/tensorflow/translation/run_translation.py
index 7f5eb9eb9defb7..7ccd089ca82dce 100644
--- a/examples/tensorflow/translation/run_translation.py
+++ b/examples/tensorflow/translation/run_translation.py
@@ -18,30 +18,32 @@
 """
 # You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments.

+import json
 import logging
 import os
 import sys
 from dataclasses import dataclass, field
-from functools import partial
 from typing import Optional

 import datasets
 import numpy as np
 import tensorflow as tf
 from datasets import load_dataset
-from tqdm import tqdm

 import evaluate
 import transformers
 from transformers import (
     AutoConfig,
     AutoTokenizer,
+    DataCollatorForSeq2Seq,
     HfArgumentParser,
+    KerasMetricCallback,
     M2M100Tokenizer,
     MBart50Tokenizer,
     MBart50TokenizerFast,
     MBartTokenizer,
     MBartTokenizerFast,
+    PushToHubCallback,
     TFAutoModelForSeq2SeqLM,
     TFTrainingArguments,
     create_optimizer,
@@ -224,6 +226,16 @@ class DataTrainingArguments:
     source_prefix: Optional[str] = field(
         default=None, metadata={"help": "A prefix to add before every source text (useful for T5 models)."}
     )
+    forced_bos_token: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": (
+                "The token to force as the first generated token after the :obj:`decoder_start_token_id`. Useful"
+                " for multilingual models like :doc:`mBART <../model_doc/mbart>`, where the first generated token"
+                " needs to be the target language token."
+            )
+        },
+    )

     def __post_init__(self):
         if self.dataset_name is None and self.train_file is None and self.validation_file is None:
@@ -239,70 +251,6 @@ def __post_init__(self):
             self.val_max_target_length = self.max_target_length


-# endregion
-
-# region Data generator
-def sample_generator(dataset, model, tokenizer, shuffle, pad_to_multiple_of=None):
-    if shuffle:
-        sample_ordering = np.random.permutation(len(dataset))
-    else:
-        sample_ordering = np.arange(len(dataset))
-    for sample_idx in sample_ordering:
-        example = dataset[int(sample_idx)]
-        # Handle dicts with proper padding and conversion to tensor.
- example = tokenizer.pad(example, return_tensors="np", pad_to_multiple_of=pad_to_multiple_of) - example = {key: tf.convert_to_tensor(arr, dtype_hint=tf.int32) for key, arr in example.items()} - if model is not None and hasattr(model, "prepare_decoder_input_ids_from_labels"): - decoder_input_ids = model.prepare_decoder_input_ids_from_labels( - labels=tf.expand_dims(example["labels"], 0) - ) - example["decoder_input_ids"] = tf.squeeze(decoder_input_ids, 0) - yield example, example["labels"] # TF needs some kind of labels, even if we don't use them - return - - -# endregion - - -# region Helper functions -def dataset_to_tf(dataset, model, tokenizer, total_batch_size, num_epochs, shuffle): - if dataset is None: - return None - train_generator = partial(sample_generator, dataset, model, tokenizer, shuffle=shuffle) - train_signature = { - feature: tf.TensorSpec(shape=(None,), dtype=tf.int32) - for feature in dataset.features - if feature != "special_tokens_mask" - } - if ( - model is not None - and "decoder_input_ids" not in train_signature - and hasattr(model, "prepare_decoder_input_ids_from_labels") - ): - train_signature["decoder_input_ids"] = train_signature["labels"] - # This may need to be changed depending on your particular model or tokenizer! - padding_values = { - key: tf.convert_to_tensor(tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0, dtype=tf.int32) - for key in train_signature.keys() - } - padding_values["labels"] = tf.convert_to_tensor(-100, dtype=tf.int32) - train_signature["labels"] = train_signature["input_ids"] - train_signature = (train_signature, train_signature["labels"]) - options = tf.data.Options() - options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF - tf_dataset = ( - tf.data.Dataset.from_generator(train_generator, output_signature=train_signature) - .with_options(options) - .padded_batch( - batch_size=total_batch_size, - drop_remainder=True, - padding_values=(padding_values, np.array(-100, dtype=np.int32)), - ) - .repeat(int(num_epochs)) - ) - return tf_dataset - - # endregion @@ -541,67 +489,149 @@ def preprocess_function(examples): # endregion # region Prepare TF Dataset objects + label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id + data_collator = DataCollatorForSeq2Seq( + tokenizer, + model=model, + label_pad_token_id=label_pad_token_id, + pad_to_multiple_of=64, # Reduce the number of unique shapes for XLA, especially for generation + return_tensors="tf", + ) num_replicas = training_args.strategy.num_replicas_in_sync total_train_batch_size = training_args.per_device_train_batch_size * num_replicas total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas - tf_train_dataset = dataset_to_tf( + + dataset_options = tf.data.Options() + dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF + + # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in + # training. This is the recommended way to use a Hugging Face dataset when training with Keras. 
You can also + # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names + # yourself if you use this method, whereas they are automatically inferred from the model input names when + # using model.prepare_tf_dataset() + # For more info see the docs: + # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset + # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset + + tf_train_dataset = model.prepare_tf_dataset( train_dataset, - model, - tokenizer, - total_batch_size=total_train_batch_size, - num_epochs=training_args.num_train_epochs, + collate_fn=data_collator, + batch_size=total_train_batch_size, shuffle=True, - ) - tf_eval_dataset = dataset_to_tf( - eval_dataset, - model, - tokenizer, - total_eval_batch_size, - num_epochs=1, - shuffle=False, - ) + ).with_options(dataset_options) + tf_eval_dataset = model.prepare_tf_dataset( + eval_dataset, collate_fn=data_collator, batch_size=total_eval_batch_size, shuffle=False + ).with_options(dataset_options) # endregion - # region Optimizer, loss and LR scheduling - # Scheduler and math around the number of training steps. - num_update_steps_per_epoch = len(train_dataset) // training_args.per_device_train_batch_size - num_train_steps = training_args.num_train_epochs * num_update_steps_per_epoch - optimizer, lr_schedule = create_optimizer( - init_lr=training_args.learning_rate, - num_train_steps=num_train_steps, - num_warmup_steps=training_args.warmup_steps, - ) - - def masked_sparse_categorical_crossentropy(y_true, y_pred): - # We clip the negative labels to 0 to avoid NaNs appearing in the output and - # fouling up everything that comes afterwards. The loss values corresponding to clipped values - # will be masked later anyway, but even masked NaNs seem to cause overflows for some reason. - # 1e6 is chosen as a reasonable upper bound for the number of token indices - in the unlikely - # event that you have more than 1 million tokens in your vocabulary, consider increasing this value. - # More pragmatically, consider redesigning your tokenizer. 
- losses = tf.keras.losses.sparse_categorical_crossentropy( - tf.clip_by_value(y_true, 0, int(1e6)), y_pred, from_logits=True + # region Optimizer and LR scheduling + num_train_steps = int(len(tf_train_dataset) * training_args.num_train_epochs) + if training_args.warmup_steps > 0: + num_warmup_steps = training_args.warmup_steps + elif training_args.warmup_ratio > 0: + num_warmup_steps = int(num_train_steps * training_args.warmup_ratio) + else: + num_warmup_steps = 0 + if training_args.do_train: + optimizer, lr_schedule = create_optimizer( + init_lr=training_args.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + adam_beta1=training_args.adam_beta1, + adam_beta2=training_args.adam_beta2, + adam_epsilon=training_args.adam_epsilon, + weight_decay_rate=training_args.weight_decay, + adam_global_clipnorm=training_args.max_grad_norm, ) - # Compute the per-sample loss only over the unmasked tokens - losses = tf.ragged.boolean_mask(losses, y_true != -100) - losses = tf.reduce_mean(losses, axis=-1) - return losses - + else: + optimizer = None # endregion # region Metric and postprocessing - metric = evaluate.load("sacrebleu") + if training_args.do_eval: + metric = evaluate.load("sacrebleu") - def postprocess_text(preds, labels): - preds = [pred.strip() for pred in preds] - labels = [[label.strip()] for label in labels] + if data_args.val_max_target_length is None: + data_args.val_max_target_length = data_args.max_target_length + + gen_kwargs = { + "max_length": data_args.val_max_target_length, + "num_beams": data_args.num_beams, + "no_repeat_ngram_size": 0, # Not supported under XLA right now, and some models set it by default + } + + def postprocess_text(preds, labels): + preds = [pred.strip() for pred in preds] + labels = [[label.strip()] for label in labels] + + return preds, labels + + def compute_metrics(preds): + predictions, labels = preds + if isinstance(predictions, tuple): + predictions = predictions[0] + decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True) + labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) + decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) + metrics = metric.compute(predictions=decoded_preds, references=decoded_labels) + return {"bleu": metrics["score"]} + + # The KerasMetricCallback allows metrics that are too complex to write as standard Keras metrics + # to be computed each epoch. Any Python code can be included in the metric_fn. This is especially + # useful for metrics like BLEU and ROUGE that perform string comparisons on decoded model outputs. 
+ # For more information, see the docs at + # https://huggingface.co/docs/transformers/main_classes/keras_callbacks#transformers.KerasMetricCallback + + metric_callback = KerasMetricCallback( + metric_fn=compute_metrics, + eval_dataset=tf_eval_dataset, + predict_with_generate=True, + use_xla_generation=True, + generate_kwargs=gen_kwargs, + ) + callbacks = [metric_callback] + else: + callbacks = [] - return preds, labels + # endregion + # region Preparing push_to_hub and model card + push_to_hub_model_id = training_args.push_to_hub_model_id + model_name = model_args.model_name_or_path.split("/")[-1] + if not push_to_hub_model_id: + push_to_hub_model_id = f"{model_name}-finetuned-{data_args.source_lang}-{data_args.target_lang}" + + model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "translation"} + if data_args.dataset_name is not None: + model_card_kwargs["dataset_tags"] = data_args.dataset_name + if data_args.dataset_config_name is not None: + model_card_kwargs["dataset_args"] = data_args.dataset_config_name + model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" + else: + model_card_kwargs["dataset"] = data_args.dataset_name + + languages = [l for l in [data_args.source_lang, data_args.target_lang] if l is not None] + if len(languages) > 0: + model_card_kwargs["language"] = languages + + if training_args.push_to_hub: + # Because this training can be quite long, we save once per epoch. + callbacks.append( + PushToHubCallback( + output_dir=training_args.output_dir, + model_id=push_to_hub_model_id, + organization=training_args.push_to_hub_organization, + token=training_args.push_to_hub_token, + tokenizer=tokenizer, + **model_card_kwargs, + ) + ) # endregion # region Training - model.compile(loss={"logits": masked_sparse_categorical_crossentropy}, optimizer=optimizer) + eval_metrics = None + model.compile(optimizer=optimizer, jit_compile=training_args.xla) if training_args.do_train: logger.info("***** Running training *****") @@ -611,41 +641,48 @@ def postprocess_text(preds, labels): logger.info(f" Total train batch size = {total_train_batch_size}") logger.info(f" Total optimization steps = {num_train_steps}") - model.fit( - tf_train_dataset, - epochs=int(training_args.num_train_epochs), - steps_per_epoch=num_update_steps_per_epoch, - ) + if training_args.xla and not data_args.pad_to_max_length: + logger.warning( + "XLA training may be slow at first when --pad_to_max_length is not set " + "until all possible shapes have been compiled." 
+ ) + + history = model.fit(tf_train_dataset, epochs=int(training_args.num_train_epochs), callbacks=callbacks) + eval_metrics = {key: val[-1] for key, val in history.history.items()} # endregion # region Validation - if data_args.val_max_target_length is None: - data_args.val_max_target_length = data_args.max_target_length - - gen_kwargs = { - "max_length": data_args.val_max_target_length, - "num_beams": data_args.num_beams, - } - if training_args.do_eval: - logger.info("Evaluation...") - for batch, labels in tqdm( - tf_eval_dataset, total=len(eval_dataset) // training_args.per_device_eval_batch_size - ): - batch.update(gen_kwargs) - generated_tokens = model.generate(**batch) - if isinstance(generated_tokens, tuple): - generated_tokens = generated_tokens[0] - decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) - labels = np.where(labels != -100, labels, tokenizer.pad_token_id) - decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) - decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) - - metric.add_batch(predictions=decoded_preds, references=decoded_labels) - eval_metric = metric.compute() - logger.info({"bleu": eval_metric["score"]}) + if training_args.do_eval and not training_args.do_train: + # Compiling generation with XLA yields enormous speedups, see https://huggingface.co/blog/tf-xla-generate + @tf.function(jit_compile=True) + def generate(**kwargs): + return model.generate(**kwargs) + + if training_args.do_eval: + logger.info("Evaluation...") + for batch, labels in tf_eval_dataset: + batch.update(gen_kwargs) + generated_tokens = generate(**batch) + if isinstance(generated_tokens, tuple): + generated_tokens = generated_tokens[0] + decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) + labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) + decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) + + metric.add_batch(predictions=decoded_preds, references=decoded_labels) + + eval_metrics = metric.compute() + logger.info({"bleu": eval_metrics["score"]}) # endregion - if training_args.output_dir is not None: + if training_args.output_dir is not None and eval_metrics is not None: + output_eval_file = os.path.join(training_args.output_dir, "all_results.json") + with open(output_eval_file, "w") as writer: + writer.write(json.dumps(eval_metrics)) + + if training_args.output_dir is not None and not training_args.push_to_hub: + # If we're not pushing to hub, at least save a local copy when we're done model.save_pretrained(training_args.output_dir) diff --git a/src/transformers/optimization_tf.py b/src/transformers/optimization_tf.py index 345b2eaf1f3aa8..e2b2a961ca1984 100644 --- a/src/transformers/optimization_tf.py +++ b/src/transformers/optimization_tf.py @@ -87,6 +87,8 @@ def create_optimizer( adam_beta1: float = 0.9, adam_beta2: float = 0.999, adam_epsilon: float = 1e-8, + adam_clipnorm: Optional[float] = None, + adam_global_clipnorm: Optional[float] = None, weight_decay_rate: float = 0.0, power: float = 1.0, include_in_weight_decay: Optional[List[str]] = None, @@ -109,6 +111,11 @@ def create_optimizer( The beta2 to use in Adam. adam_epsilon (`float`, *optional*, defaults to 1e-8): The epsilon to use in Adam. + adam_clipnorm: (`float`, *optional*, defaults to `None`): + If not `None`, clip the gradient norm for each weight tensor to this value. 
+ adam_global_clipnorm: (`float`, *optional*, defaults to `None`) + If not `None`, clip gradient norm to this value. When using this argument, the norm is computed over all + weight tensors, as if they were concatenated into a single vector. weight_decay_rate (`float`, *optional*, defaults to 0): The weight decay to use. power (`float`, *optional*, defaults to 1.0): @@ -137,12 +144,19 @@ def create_optimizer( beta_1=adam_beta1, beta_2=adam_beta2, epsilon=adam_epsilon, + clipnorm=adam_clipnorm, + global_clipnorm=adam_global_clipnorm, exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"], include_in_weight_decay=include_in_weight_decay, ) else: optimizer = tf.keras.optimizers.Adam( - learning_rate=lr_schedule, beta_1=adam_beta1, beta_2=adam_beta2, epsilon=adam_epsilon + learning_rate=lr_schedule, + beta_1=adam_beta1, + beta_2=adam_beta2, + epsilon=adam_epsilon, + clipnorm=adam_clipnorm, + global_clipnorm=adam_global_clipnorm, ) # We return the optimizer and the LR scheduler in order to better track the # evolution of the LR independently of the optimizer. diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index e662d6fca4fdaa..e9a9f8f0043a79 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -106,6 +106,7 @@ class OptimizerNames(ExplicitEnum): @dataclass class TrainingArguments: + framework = "pt" """ TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop itself**. @@ -1039,25 +1040,25 @@ def __post_init__(self): self.greater_is_better = self.metric_for_best_model not in ["loss", "eval_loss"] if self.run_name is None: self.run_name = self.output_dir + if self.framework == "pt" and is_torch_available(): + if self.fp16_backend and self.fp16_backend != "auto": + warnings.warn( + "`fp16_backend` is deprecated and will be removed in version 5 of 🤗 Transformers. Use" + " `half_precision_backend` instead", + FutureWarning, + ) + self.half_precision_backend = self.fp16_backend - if self.fp16_backend and self.fp16_backend != "auto": - warnings.warn( - "`fp16_backend` is deprecated and will be removed in version 5 of 🤗 Transformers. Use" - " `half_precision_backend` instead", - FutureWarning, - ) - self.half_precision_backend = self.fp16_backend - - if self.bf16 or self.bf16_full_eval: + if self.bf16 or self.bf16_full_eval: - if self.no_cuda and not is_torch_bf16_cpu_available(): - # cpu - raise ValueError("Your setup doesn't support bf16/cpu. You need torch>=1.10") - elif not self.no_cuda and not is_torch_bf16_gpu_available(): - # gpu - raise ValueError( - "Your setup doesn't support bf16/gpu. You need torch>=1.10, using Ampere GPU with cuda>=11.0" - ) + if self.no_cuda and not is_torch_bf16_cpu_available(): + # cpu + raise ValueError("Your setup doesn't support bf16/cpu. You need torch>=1.10") + elif not self.no_cuda and not is_torch_bf16_gpu_available(): + # gpu + raise ValueError( + "Your setup doesn't support bf16/gpu. 
You need torch>=1.10, using Ampere GPU with cuda>=11.0" + ) if self.fp16 and self.bf16: raise ValueError("At most one of fp16 and bf16 can be True, but not both") @@ -1084,7 +1085,8 @@ def __post_init__(self): self.optim = OptimizerNames.ADAFACTOR if ( - is_torch_available() + self.framework == "pt" + and is_torch_available() and (self.device.type != "cuda") and not (self.device.type == "xla" and "GPU_NUM_DEVICES" in os.environ) and (self.fp16 or self.fp16_full_eval) @@ -1095,7 +1097,8 @@ def __post_init__(self): ) if ( - is_torch_available() + self.framework == "pt" + and is_torch_available() and (self.device.type != "cuda") and not (self.device.type == "xla" and "GPU_NUM_DEVICES" in os.environ) and (self.device.type != "cpu") @@ -1106,7 +1109,7 @@ def __post_init__(self): " (`--bf16_full_eval`) can only be used on CUDA or CPU devices." ) - if is_torch_available() and self.tf32 is not None: + if self.framework == "pt" and is_torch_available() and self.tf32 is not None: if self.tf32: if is_torch_tf32_available(): torch.backends.cuda.matmul.allow_tf32 = True diff --git a/src/transformers/training_args_tf.py b/src/transformers/training_args_tf.py index 060b78e9220518..fdae51f72d4b56 100644 --- a/src/transformers/training_args_tf.py +++ b/src/transformers/training_args_tf.py @@ -28,6 +28,7 @@ @dataclass class TFTrainingArguments(TrainingArguments): + framework = "tf" """ TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop itself**. @@ -188,9 +189,6 @@ class TFTrainingArguments(TrainingArguments): def _setup_strategy(self) -> Tuple["tf.distribute.Strategy", int]: logger.info("Tensorflow: setting up strategy") - if self.xla: - tf.config.optimizer.set_jit(True) - gpus = tf.config.list_physical_devices("GPU") # Set to float16 at first From c9c542043efee7aa8fb815ab94a38bb6c23f0376 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Wed, 10 Aug 2022 11:55:18 -0400 Subject: [PATCH 101/162] Use commit hash to look in cache instead of calling head (#18534) * Use commit hash to look in cache instead of calling head * Add tests * Add attr for local configs too * Stupid typos * Fix tests * Update src/transformers/utils/hub.py Co-authored-by: Julien Chaumond * Address Julien's comments Co-authored-by: Julien Chaumond --- src/transformers/configuration_utils.py | 23 +++++++- src/transformers/modeling_flax_utils.py | 7 +++ src/transformers/modeling_tf_utils.py | 7 +++ src/transformers/modeling_utils.py | 6 ++ .../models/auto/tokenization_auto.py | 15 ++++- src/transformers/pipelines/__init__.py | 11 +++- src/transformers/testing_utils.py | 28 +++++++++ src/transformers/tokenization_utils_base.py | 16 +++++- src/transformers/utils/__init__.py | 1 + src/transformers/utils/hub.py | 57 +++++++++++++++---- tests/models/auto/test_modeling_auto.py | 19 +++++++ tests/models/auto/test_modeling_tf_auto.py | 19 +++++++ tests/models/auto/test_tokenization_auto.py | 12 ++++ tests/pipelines/test_pipelines_common.py | 11 ++++ tests/test_configuration_common.py | 12 ++-- 15 files changed, 221 insertions(+), 23 deletions(-) diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index b924cec9ae021c..41503255ac2adb 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -27,7 +27,15 @@ from . 
import __version__ from .dynamic_module_utils import custom_object_save -from .utils import CONFIG_NAME, PushToHubMixin, cached_file, copy_func, is_torch_available, logging +from .utils import ( + CONFIG_NAME, + PushToHubMixin, + cached_file, + copy_func, + extract_commit_hash, + is_torch_available, + logging, +) logger = logging.get_logger(__name__) @@ -343,6 +351,8 @@ def __init__(self, **kwargs): # Name or path to the pretrained checkpoint self._name_or_path = str(kwargs.pop("name_or_path", "")) + # Config hash + self._commit_hash = kwargs.pop("_commit_hash", None) # Drop the transformers version info self.transformers_version = kwargs.pop("transformers_version", None) @@ -539,6 +549,8 @@ def get_config_dict( original_kwargs = copy.deepcopy(kwargs) # Get config dict associated with the base config file config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs) + if "_commit_hash" in config_dict: + original_kwargs["_commit_hash"] = config_dict["_commit_hash"] # That config file may point us toward another config file to use. if "configuration_files" in config_dict: @@ -564,6 +576,7 @@ def _get_config_dict( subfolder = kwargs.pop("subfolder", "") from_pipeline = kwargs.pop("_from_pipeline", None) from_auto_class = kwargs.pop("_from_auto", False) + commit_hash = kwargs.pop("_commit_hash", None) if trust_remote_code is True: logger.warning( @@ -599,7 +612,9 @@ def _get_config_dict( user_agent=user_agent, revision=revision, subfolder=subfolder, + _commit_hash=commit_hash, ) + commit_hash = extract_commit_hash(resolved_config_file, commit_hash) except EnvironmentError: # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted to # the original exception. @@ -616,6 +631,7 @@ def _get_config_dict( try: # Load config dict config_dict = cls._dict_from_json_file(resolved_config_file) + config_dict["_commit_hash"] = commit_hash except (json.JSONDecodeError, UnicodeDecodeError): raise EnvironmentError( f"It looks like the config file at '{resolved_config_file}' is not a valid JSON file." @@ -648,6 +664,9 @@ def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "PretrainedConfig": # We remove them so they don't appear in `return_unused_kwargs`. kwargs.pop("_from_auto", None) kwargs.pop("_from_pipeline", None) + # The commit hash might have been updated in the `config_dict`, we don't want the kwargs to erase that update. 
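+        # (If both carry a hash, the one from `config_dict` wins, since it reflects the
+        # config file that was actually resolved.)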
+ if "_commit_hash" in kwargs and "_commit_hash" in config_dict: + kwargs["_commit_hash"] = config_dict["_commit_hash"] config = cls(**config_dict) @@ -751,6 +770,8 @@ def to_dict(self) -> Dict[str, Any]: output["model_type"] = self.__class__.model_type if "_auto_class" in output: del output["_auto_class"] + if "_commit_hash" in output: + del output["_commit_hash"] # Transformers version when serializing the model output["transformers_version"] = __version__ diff --git a/src/transformers/modeling_flax_utils.py b/src/transformers/modeling_flax_utils.py index af75b418cad23e..683e25631c0f44 100644 --- a/src/transformers/modeling_flax_utils.py +++ b/src/transformers/modeling_flax_utils.py @@ -595,6 +595,7 @@ def from_pretrained( from_auto_class = kwargs.pop("_from_auto", False) _do_init = kwargs.pop("_do_init", True) subfolder = kwargs.pop("subfolder", "") + commit_hash = kwargs.pop("_commit_hash", None) if trust_remote_code is True: logger.warning( @@ -625,11 +626,15 @@ def from_pretrained( revision=revision, _from_auto=from_auto_class, _from_pipeline=from_pipeline, + _commit_hash=commit_hash, **kwargs, ) else: model_kwargs = kwargs + if commit_hash is None: + commit_hash = getattr(config, "_commit_hash", None) + # Add the dtype to model_kwargs model_kwargs["dtype"] = dtype @@ -682,6 +687,7 @@ def from_pretrained( revision=revision, subfolder=subfolder, _raise_exceptions_for_missing_entries=False, + _commit_hash=commit_hash, ) resolved_archive_file = cached_file(pretrained_model_name_or_path, filename, **cached_file_kwargs) @@ -748,6 +754,7 @@ def from_pretrained( use_auth_token=use_auth_token, user_agent=user_agent, revision=revision, + _commit_hash=commit_hash, ) # init random models diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index 68ee4117a2f9db..3587354b9326a9 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -2161,6 +2161,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): from_pipeline = kwargs.pop("_from_pipeline", None) from_auto_class = kwargs.pop("_from_auto", False) subfolder = kwargs.pop("subfolder", "") + commit_hash = kwargs.pop("_commit_hash", None) if trust_remote_code is True: logger.warning( @@ -2191,11 +2192,15 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): revision=revision, _from_auto=from_auto_class, _from_pipeline=from_pipeline, + _commit_hash=commit_hash, **kwargs, ) else: model_kwargs = kwargs + if commit_hash is None: + commit_hash = getattr(config, "_commit_hash", None) + # This variable will flag if we're loading a sharded checkpoint. In this case the archive file is just the # index of the files. 
is_sharded = False @@ -2253,6 +2258,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): revision=revision, subfolder=subfolder, _raise_exceptions_for_missing_entries=False, + _commit_hash=commit_hash, ) resolved_archive_file = cached_file(pretrained_model_name_or_path, filename, **cached_file_kwargs) @@ -2320,6 +2326,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): use_auth_token=use_auth_token, user_agent=user_agent, revision=revision, + _commit_hash=commit_hash, ) config.name_or_path = pretrained_model_name_or_path diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 1d895baecfedac..d77258c94ea089 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1840,6 +1840,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P load_in_8bit = kwargs.pop("load_in_8bit", False) int8_threshold = kwargs.pop("int8_threshold", 6.0) subfolder = kwargs.pop("subfolder", "") + commit_hash = kwargs.pop("_commit_hash", None) if trust_remote_code is True: logger.warning( @@ -1918,6 +1919,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P else: model_kwargs = kwargs + if commit_hash is None: + commit_hash = getattr(config, "_commit_hash", None) + # This variable will flag if we're loading a sharded checkpoint. In this case the archive file is just the # index of the files. is_sharded = False @@ -2004,6 +2008,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P revision=revision, subfolder=subfolder, _raise_exceptions_for_missing_entries=False, + _commit_hash=commit_hash, ) resolved_archive_file = cached_file(pretrained_model_name_or_path, filename, **cached_file_kwargs) @@ -2078,6 +2083,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P user_agent=user_agent, revision=revision, subfolder=subfolder, + _commit_hash=commit_hash, ) # load pt weights early so that we know which dtype to init the model under diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index d8759fd4e7842e..8ece13b79fe3fa 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -25,7 +25,7 @@ from ...tokenization_utils import PreTrainedTokenizer from ...tokenization_utils_base import TOKENIZER_CONFIG_FILE from ...tokenization_utils_fast import PreTrainedTokenizerFast -from ...utils import get_file_from_repo, is_sentencepiece_available, is_tokenizers_available, logging +from ...utils import cached_file, extract_commit_hash, is_sentencepiece_available, is_tokenizers_available, logging from ..encoder_decoder import EncoderDecoderConfig from .auto_factory import _LazyAutoMapping from .configuration_auto import ( @@ -389,7 +389,8 @@ def get_tokenizer_config( tokenizer.save_pretrained("tokenizer-test") tokenizer_config = get_tokenizer_config("tokenizer-test") ```""" - resolved_config_file = get_file_from_repo( + commit_hash = kwargs.get("_commit_hash", None) + resolved_config_file = cached_file( pretrained_model_name_or_path, TOKENIZER_CONFIG_FILE, cache_dir=cache_dir, @@ -399,13 +400,19 @@ def get_tokenizer_config( use_auth_token=use_auth_token, revision=revision, local_files_only=local_files_only, + _raise_exceptions_for_missing_entries=False, + _raise_exceptions_for_connection_errors=False, + _commit_hash=commit_hash, ) if resolved_config_file is 
None: logger.info("Could not locate the tokenizer configuration file, will try to use the model config instead.") return {} + commit_hash = extract_commit_hash(resolved_config_file, commit_hash) with open(resolved_config_file, encoding="utf-8") as reader: - return json.load(reader) + result = json.load(reader) + result["_commit_hash"] = commit_hash + return result class AutoTokenizer: @@ -532,6 +539,8 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): # Next, let's try to use the tokenizer_config file to get the tokenizer class. tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs) + if "_commit_hash" in tokenizer_config: + kwargs["_commit_hash"] = tokenizer_config["_commit_hash"] config_tokenizer_class = tokenizer_config.get("tokenizer_class") tokenizer_auto_map = None if "auto_map" in tokenizer_config: diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py index dfa75768d8f811..5752790aa9614b 100755 --- a/src/transformers/pipelines/__init__.py +++ b/src/transformers/pipelines/__init__.py @@ -557,7 +557,12 @@ def pipeline( # Make sure we only pass use_auth_token once as a kwarg (it used to be possible to pass it in model_kwargs, # this is to keep BC). use_auth_token = model_kwargs.pop("use_auth_token", use_auth_token) - hub_kwargs = {"revision": revision, "use_auth_token": use_auth_token, "trust_remote_code": trust_remote_code} + hub_kwargs = { + "revision": revision, + "use_auth_token": use_auth_token, + "trust_remote_code": trust_remote_code, + "_commit_hash": None, + } if task is None and model is None: raise RuntimeError( @@ -583,8 +588,10 @@ def pipeline( # Instantiate config if needed if isinstance(config, str): config = AutoConfig.from_pretrained(config, _from_pipeline=task, **hub_kwargs, **model_kwargs) + hub_kwargs["_commit_hash"] = config._commit_hash elif config is None and isinstance(model, str): config = AutoConfig.from_pretrained(model, _from_pipeline=task, **hub_kwargs, **model_kwargs) + hub_kwargs["_commit_hash"] = config._commit_hash custom_tasks = {} if config is not None and len(getattr(config, "custom_pipelines", {})) > 0: @@ -639,6 +646,7 @@ def pipeline( ) if config is None and isinstance(model, str): config = AutoConfig.from_pretrained(model, _from_pipeline=task, **hub_kwargs, **model_kwargs) + hub_kwargs["_commit_hash"] = config._commit_hash if device_map is not None: if "device_map" in model_kwargs: @@ -672,6 +680,7 @@ def pipeline( ) model_config = model.config + hub_kwargs["_commit_hash"] = model.config._commit_hash load_tokenizer = type(model_config) in TOKENIZER_MAPPING or model_config.tokenizer_class is not None load_feature_extractor = type(model_config) in FEATURE_EXTRACTOR_MAPPING or feature_extractor is not None diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 80f7bf9c863c87..d21f353a60a8f5 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -31,6 +31,7 @@ from typing import Iterator, List, Union from unittest import mock +import huggingface_hub from transformers import logging as transformers_logging from .deepspeed import is_deepspeed_available @@ -1588,3 +1589,30 @@ def run_command(command: List[str], return_stdout=False): raise SubprocessCallException( f"Command `{' '.join(command)}` failed with the following error:\n\n{e.output.decode()}" ) from e + + +class RequestCounter: + """ + Helper class that will count all requests made online. 
+ """ + + def __enter__(self): + self.head_request_count = 0 + self.get_request_count = 0 + self.other_request_count = 0 + self.old_request = huggingface_hub.file_download.requests.request + huggingface_hub.file_download.requests.request = self.new_request + return self + + def __exit__(self, *args, **kwargs): + huggingface_hub.file_download.requests.request = self.old_request + + def new_request(self, method, **kwargs): + if method == "GET": + self.get_request_count += 1 + elif method == "HEAD": + self.head_request_count += 1 + else: + self.other_request_count += 1 + + return self.old_request(method=method, **kwargs) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index f85dc73cb659cb..566fd3fbf92b05 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -42,7 +42,7 @@ add_end_docstrings, cached_file, copy_func, - get_file_from_repo, + extract_commit_hash, is_flax_available, is_offline_mode, is_tf_available, @@ -1651,6 +1651,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], subfolder = kwargs.pop("subfolder", None) from_pipeline = kwargs.pop("_from_pipeline", None) from_auto_class = kwargs.pop("_from_auto", False) + commit_hash = kwargs.pop("_commit_hash", None) user_agent = {"file_type": "tokenizer", "from_auto_class": from_auto_class, "is_fast": "Fast" in cls.__name__} if from_pipeline is not None: @@ -1690,7 +1691,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], if "tokenizer_file" in vocab_files: # Try to get the tokenizer config to see if there are versioned tokenizer files. fast_tokenizer_file = FULL_TOKENIZER_FILE - resolved_config_file = get_file_from_repo( + resolved_config_file = cached_file( pretrained_model_name_or_path, TOKENIZER_CONFIG_FILE, cache_dir=cache_dir, @@ -1701,7 +1702,12 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], revision=revision, local_files_only=local_files_only, subfolder=subfolder, + user_agent=user_agent, + _raise_exceptions_for_missing_entries=False, + _raise_exceptions_for_connection_errors=False, + _commit_hash=commit_hash, ) + commit_hash = extract_commit_hash(resolved_config_file, commit_hash) if resolved_config_file is not None: with open(resolved_config_file, encoding="utf-8") as reader: tokenizer_config = json.load(reader) @@ -1730,7 +1736,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], subfolder=subfolder, _raise_exceptions_for_missing_entries=False, _raise_exceptions_for_connection_errors=False, + _commit_hash=commit_hash, ) + commit_hash = extract_commit_hash(resolved_vocab_files[file_id], commit_hash) if len(unresolved_files) > 0: logger.info( @@ -1763,6 +1771,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], use_auth_token=use_auth_token, cache_dir=cache_dir, local_files_only=local_files_only, + _commit_hash=commit_hash, **kwargs, ) @@ -1776,6 +1785,7 @@ def _from_pretrained( use_auth_token=None, cache_dir=None, local_files_only=False, + _commit_hash=None, **kwargs ): # We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json @@ -1791,6 +1801,7 @@ def _from_pretrained( use_auth_token=use_auth_token, cache_dir=cache_dir, local_files_only=local_files_only, + _commit_hash=_commit_hash, **(copy.deepcopy(kwargs)), ) else: @@ -1823,6 +1834,7 @@ def _from_pretrained( use_auth_token=use_auth_token, 
cache_dir=cache_dir, local_files_only=local_files_only, + _commit_hash=_commit_hash, ) config_tokenizer_class = config.tokenizer_class except (OSError, ValueError, KeyError): diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index c778489e5e3995..1ac2622d8a3f17 100644 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -63,6 +63,7 @@ cached_file, default_cache_path, define_sagemaker_information, + extract_commit_hash, get_cached_models, get_file_from_repo, get_full_repo_name, diff --git a/src/transformers/utils/hub.py b/src/transformers/utils/hub.py index 07164e735db901..00f9c277c41773 100644 --- a/src/transformers/utils/hub.py +++ b/src/transformers/utils/hub.py @@ -38,6 +38,7 @@ whoami, ) from huggingface_hub.constants import HUGGINGFACE_HEADER_X_LINKED_ETAG, HUGGINGFACE_HEADER_X_REPO_COMMIT +from huggingface_hub.file_download import REGEX_COMMIT_HASH from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError, RevisionNotFoundError from requests.exceptions import HTTPError from transformers.utils.logging import tqdm @@ -200,11 +201,27 @@ def http_user_agent(user_agent: Union[Dict, str, None] = None) -> str: return ua -def try_to_load_from_cache(cache_dir, repo_id, filename, revision=None): +def extract_commit_hash(resolved_file: Optional[str], commit_hash: Optional[str]): + """ + Extracts the commit hash from a resolved filename toward a cache file. + """ + if resolved_file is None or commit_hash is not None: + return commit_hash + + search = re.search(r"snapshots/([^/]+)/", resolved_file) + if search is None: + return None + commit_hash = search.groups()[0] + return commit_hash if REGEX_COMMIT_HASH.match(commit_hash) else None + + +def try_to_load_from_cache(cache_dir, repo_id, filename, revision=None, commit_hash=None): """ Explores the cache to return the latest cached file for a given revision. 
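+
+    If `commit_hash` is passed, the `refs` resolution step is skipped and the snapshot folder for that
+    exact hash is looked up directly; `commit_hash` and `revision` are therefore mutually exclusive.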
""" - if revision is None: + if commit_hash is not None and revision is not None: + raise ValueError("`commit_hash` and `revision` are mutually exclusive, pick one only.") + if revision is None and commit_hash is None: revision = "main" model_id = repo_id.replace("/", "--") @@ -216,18 +233,19 @@ def try_to_load_from_cache(cache_dir, repo_id, filename, revision=None): if not os.path.isdir(os.path.join(model_cache, subfolder)): return None - # Resolve refs (for instance to convert main to the associated commit sha) - cached_refs = os.listdir(os.path.join(model_cache, "refs")) - if revision in cached_refs: - with open(os.path.join(model_cache, "refs", revision)) as f: - revision = f.read() + if commit_hash is None: + # Resolve refs (for instance to convert main to the associated commit sha) + cached_refs = os.listdir(os.path.join(model_cache, "refs")) + if revision in cached_refs: + with open(os.path.join(model_cache, "refs", revision)) as f: + commit_hash = f.read() cached_shas = os.listdir(os.path.join(model_cache, "snapshots")) - if revision not in cached_shas: + if commit_hash not in cached_shas: # No cache for this revision and we won't try to return a random revision return None - cached_file = os.path.join(model_cache, "snapshots", revision, filename) + cached_file = os.path.join(model_cache, "snapshots", commit_hash, filename) return cached_file if os.path.isfile(cached_file) else None @@ -265,8 +283,9 @@ def cached_file( local_files_only: bool = False, subfolder: str = "", user_agent: Optional[Union[str, Dict[str, str]]] = None, - _raise_exceptions_for_missing_entries=True, - _raise_exceptions_for_connection_errors=True, + _raise_exceptions_for_missing_entries: bool = True, + _raise_exceptions_for_connection_errors: bool = True, + _commit_hash: Optional[str] = None, ): """ Tries to locate a file in a local folder and repo, downloads and cache it if necessary. @@ -318,6 +337,13 @@ def cached_file( # Download a model weight from the Hub and cache it. model_weights_file = cached_file("bert-base-uncased", "pytorch_model.bin") ```""" + # Private arguments + # _raise_exceptions_for_missing_entries: if False, do not raise an exception for missing entries but return + # None. + # _raise_exceptions_for_connection_errors: if False, do not raise an exception for connection errors but return + # None. + # _commit_hash: passed when we are chaining several calls to various files (e.g. when loading a tokenizer or + # a pipeline). If files are cached for this commit hash, avoid calls to head and get from the cache. if is_offline_mode() and not local_files_only: logger.info("Offline mode: forcing local_files_only=True") local_files_only = True @@ -339,6 +365,13 @@ def cached_file( cache_dir = TRANSFORMERS_CACHE if isinstance(cache_dir, Path): cache_dir = str(cache_dir) + + if _commit_hash is not None: + # If the file is cached under that commit hash, we return it directly. 
+ resolved_file = try_to_load_from_cache(cache_dir, path_or_repo_id, full_filename, commit_hash=_commit_hash) + if resolved_file is not None: + return resolved_file + user_agent = http_user_agent(user_agent) try: # Load from URL or cache if already cached @@ -803,6 +836,7 @@ def get_checkpoint_shard_files( user_agent=None, revision=None, subfolder="", + _commit_hash=None, ): """ For a given model: @@ -848,6 +882,7 @@ def get_checkpoint_shard_files( user_agent=user_agent, revision=revision, subfolder=subfolder, + _commit_hash=_commit_hash, ) # We have already dealt with RepositoryNotFoundError and RevisionNotFoundError when getting the index, so # we don't have to catch them here. diff --git a/tests/models/auto/test_modeling_auto.py b/tests/models/auto/test_modeling_auto.py index 3731d70f5bb5af..2e1e51a81daac6 100644 --- a/tests/models/auto/test_modeling_auto.py +++ b/tests/models/auto/test_modeling_auto.py @@ -24,6 +24,7 @@ from transformers.testing_utils import ( DUMMY_UNKNOWN_IDENTIFIER, SMALL_MODEL_IDENTIFIER, + RequestCounter, require_scatter, require_torch, slow, @@ -354,3 +355,21 @@ def test_model_from_tf_suggestion(self): def test_model_from_flax_suggestion(self): with self.assertRaisesRegex(EnvironmentError, "Use `from_flax=True` to load this model"): _ = AutoModel.from_pretrained("hf-internal-testing/tiny-bert-flax-only") + + def test_cached_model_has_minimum_calls_to_head(self): + # Make sure we have cached the model. + _ = AutoModel.from_pretrained("hf-internal-testing/tiny-random-bert") + with RequestCounter() as counter: + _ = AutoModel.from_pretrained("hf-internal-testing/tiny-random-bert") + self.assertEqual(counter.get_request_count, 0) + self.assertEqual(counter.head_request_count, 1) + self.assertEqual(counter.other_request_count, 0) + + # With a sharded checkpoint + _ = AutoModel.from_pretrained("hf-internal-testing/tiny-random-bert-sharded") + with RequestCounter() as counter: + _ = AutoModel.from_pretrained("hf-internal-testing/tiny-random-bert-sharded") + self.assertEqual(counter.get_request_count, 0) + # There is no pytorch_model.bin so we still get one call for this one. + self.assertEqual(counter.head_request_count, 2) + self.assertEqual(counter.other_request_count, 0) diff --git a/tests/models/auto/test_modeling_tf_auto.py b/tests/models/auto/test_modeling_tf_auto.py index a803a3451107e2..bbde4f582bdfb0 100644 --- a/tests/models/auto/test_modeling_tf_auto.py +++ b/tests/models/auto/test_modeling_tf_auto.py @@ -21,6 +21,7 @@ from transformers.testing_utils import ( DUMMY_UNKNOWN_IDENTIFIER, SMALL_MODEL_IDENTIFIER, + RequestCounter, require_tensorflow_probability, require_tf, slow, @@ -287,3 +288,21 @@ def test_model_file_not_found(self): def test_model_from_pt_suggestion(self): with self.assertRaisesRegex(EnvironmentError, "Use `from_pt=True` to load this model"): _ = TFAutoModel.from_pretrained("hf-internal-testing/tiny-bert-pt-only") + + def test_cached_model_has_minimum_calls_to_head(self): + # Make sure we have cached the model. 
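+        # The first load below warms the local cache; the counted load should then be resolved from the
+        # cache with no GET requests, only HEAD calls that validate the cached files.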
+ _ = TFAutoModel.from_pretrained("hf-internal-testing/tiny-random-bert") + with RequestCounter() as counter: + _ = TFAutoModel.from_pretrained("hf-internal-testing/tiny-random-bert") + self.assertEqual(counter.get_request_count, 0) + self.assertEqual(counter.head_request_count, 1) + self.assertEqual(counter.other_request_count, 0) + + # With a sharded checkpoint + _ = TFAutoModel.from_pretrained("ArthurZ/tiny-random-bert-sharded") + with RequestCounter() as counter: + _ = TFAutoModel.from_pretrained("ArthurZ/tiny-random-bert-sharded") + self.assertEqual(counter.get_request_count, 0) + # There is no pytorch_model.bin so we still get one call for this one. + self.assertEqual(counter.head_request_count, 2) + self.assertEqual(counter.other_request_count, 0) diff --git a/tests/models/auto/test_tokenization_auto.py b/tests/models/auto/test_tokenization_auto.py index 1e1abb9245842c..830362e29cd654 100644 --- a/tests/models/auto/test_tokenization_auto.py +++ b/tests/models/auto/test_tokenization_auto.py @@ -48,6 +48,7 @@ DUMMY_DIFF_TOKENIZER_IDENTIFIER, DUMMY_UNKNOWN_IDENTIFIER, SMALL_MODEL_IDENTIFIER, + RequestCounter, require_tokenizers, slow, ) @@ -213,6 +214,7 @@ def test_auto_tokenizer_fast_no_slow(self): def test_get_tokenizer_config(self): # Check we can load the tokenizer config of an online model. config = get_tokenizer_config("bert-base-cased") + _ = config.pop("_commit_hash", None) # If we ever update bert-base-cased tokenizer config, this dict here will need to be updated. self.assertEqual(config, {"do_lower_case": False}) @@ -340,3 +342,13 @@ def test_revision_not_found(self): EnvironmentError, r"aaaaaa is not a valid git identifier \(branch name, tag name or commit id\)" ): _ = AutoTokenizer.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER, revision="aaaaaa") + + def test_cached_tokenizer_has_minimum_calls_to_head(self): + # Make sure we have cached the tokenizer. + _ = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bert") + with RequestCounter() as counter: + _ = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bert") + self.assertEqual(counter.get_request_count, 0) + # We still have one extra call because the model does not have a added_tokens.json file + self.assertEqual(counter.head_request_count, 2) + self.assertEqual(counter.other_request_count, 0) diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index 5d5c8fa2333eb6..5e0296c7136725 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -49,6 +49,7 @@ TOKEN, USER, CaptureLogger, + RequestCounter, is_pipeline_test, is_staging_test, nested_simplify, @@ -877,6 +878,16 @@ def test_dynamic_pipeline(self): [{"label": "LABEL_0", "score": 0.505}], ) + def test_cached_pipeline_has_minimum_calls_to_head(self): + # Make sure we have cached the pipeline. 
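+        # Populate the cache once, then check that rebuilding the pipeline triggers no downloads (GET
+        # requests), only cheap HEAD validation calls.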
+ _ = pipeline("text-classification", model="hf-internal-testing/tiny-random-bert") + with RequestCounter() as counter: + _ = pipeline("text-classification", model="hf-internal-testing/tiny-random-bert") + self.assertEqual(counter.get_request_count, 0) + # We still have one extra call because the model does not have a added_tokens.json file + self.assertEqual(counter.head_request_count, 2) + self.assertEqual(counter.other_request_count, 0) + @require_torch @is_staging_test diff --git a/tests/test_configuration_common.py b/tests/test_configuration_common.py index 397346c7deec77..5447fb6afb70eb 100644 --- a/tests/test_configuration_common.py +++ b/tests/test_configuration_common.py @@ -246,7 +246,7 @@ def test_push_to_hub(self): config.push_to_hub("test-config", use_auth_token=self._token) new_config = BertConfig.from_pretrained(f"{USER}/test-config") - for k, v in config.__dict__.items(): + for k, v in config.to_dict().items(): if k != "transformers_version": self.assertEqual(v, getattr(new_config, k)) @@ -258,7 +258,7 @@ def test_push_to_hub(self): config.save_pretrained(tmp_dir, repo_id="test-config", push_to_hub=True, use_auth_token=self._token) new_config = BertConfig.from_pretrained(f"{USER}/test-config") - for k, v in config.__dict__.items(): + for k, v in config.to_dict().items(): if k != "transformers_version": self.assertEqual(v, getattr(new_config, k)) @@ -269,7 +269,7 @@ def test_push_to_hub_in_organization(self): config.push_to_hub("valid_org/test-config-org", use_auth_token=self._token) new_config = BertConfig.from_pretrained("valid_org/test-config-org") - for k, v in config.__dict__.items(): + for k, v in config.to_dict().items(): if k != "transformers_version": self.assertEqual(v, getattr(new_config, k)) @@ -283,7 +283,7 @@ def test_push_to_hub_in_organization(self): ) new_config = BertConfig.from_pretrained("valid_org/test-config-org") - for k, v in config.__dict__.items(): + for k, v in config.to_dict().items(): if k != "transformers_version": self.assertEqual(v, getattr(new_config, k)) @@ -323,7 +323,9 @@ def test_config_common_kwargs_is_complete(self): base_config = PretrainedConfig() missing_keys = [key for key in base_config.__dict__ if key not in config_common_kwargs] # If this part of the test fails, you have arguments to addin config_common_kwargs above. - self.assertListEqual(missing_keys, ["is_encoder_decoder", "_name_or_path", "transformers_version"]) + self.assertListEqual( + missing_keys, ["is_encoder_decoder", "_name_or_path", "_commit_hash", "transformers_version"] + ) keys_with_defaults = [key for key, value in config_common_kwargs.items() if value == getattr(base_config, key)] if len(keys_with_defaults) > 0: raise ValueError( From 5d39088435902e3eb768a9961c914d55df5a6042 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Wed, 10 Aug 2022 18:52:15 +0200 Subject: [PATCH 102/162] `pipeline` support for `device="mps"` (or any other string) (#18494) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * `pipeline` support for `device="mps"` (or any other string) * Simplify `if` nesting * Update src/transformers/pipelines/base.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Fix? 
@sgugger * passing `attr=None` is not the same as not passing `attr` 🤯 Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/pipelines/__init__.py | 7 +++++++ src/transformers/pipelines/base.py | 19 +++++++++++++------ 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py index 5752790aa9614b..74f6e796801c7e 100755 --- a/src/transformers/pipelines/__init__.py +++ b/src/transformers/pipelines/__init__.py @@ -422,6 +422,7 @@ def pipeline( revision: Optional[str] = None, use_fast: bool = True, use_auth_token: Optional[Union[str, bool]] = None, + device: Optional[Union[int, str, "torch.device"]] = None, device_map=None, torch_dtype=None, trust_remote_code: Optional[bool] = None, @@ -508,6 +509,9 @@ def pipeline( use_auth_token (`str` or *bool*, *optional*): The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated when running `huggingface-cli login` (stored in `~/.huggingface`). + device (`int` or `str` or `torch.device`): + Defines the device (*e.g.*, `"cpu"`, `"cuda:1"`, `"mps"`, or a GPU ordinal rank like `1`) on which this + pipeline will be allocated. device_map (`str` or `Dict[str, Union[int, str, torch.device]`, *optional*): Sent directly as `model_kwargs` (just a simpler shortcut). When `accelerate` library is present, set `device_map="auto"` to compute the most optimized `device_map` automatically. [More @@ -811,4 +815,7 @@ def pipeline( if feature_extractor is not None: kwargs["feature_extractor"] = feature_extractor + if device is not None: + kwargs["device"] = device + return pipeline_class(model=model, framework=framework, task=task, **kwargs) diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py index 6e2c28e5ddf84d..a0ce06ec5e33f1 100644 --- a/src/transformers/pipelines/base.py +++ b/src/transformers/pipelines/base.py @@ -704,7 +704,7 @@ def predict(self, X): Reference to the object in charge of parsing supplied pipeline parameters. device (`int`, *optional*, defaults to -1): Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, a positive will run the model on - the associated CUDA device id. You can pass native `torch.device` too. + the associated CUDA device id. You can pass native `torch.device` or a `str` too. binary_output (`bool`, *optional*, defaults to `False`): Flag indicating if the output the pipeline should happen in a binary format (i.e., pickle) or as raw text. 
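+
+    Example of pinning a pipeline to a device (a sketch; the model name is illustrative):
+
+        pipe = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english", device="mps")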
""" @@ -747,7 +747,7 @@ def __init__( framework: Optional[str] = None, task: str = "", args_parser: ArgumentHandler = None, - device: int = -1, + device: Union[int, str, "torch.device"] = -1, binary_output: bool = False, **kwargs, ): @@ -760,14 +760,21 @@ def __init__( self.feature_extractor = feature_extractor self.modelcard = modelcard self.framework = framework - if is_torch_available() and isinstance(device, torch.device): - self.device = device + if is_torch_available() and self.framework == "pt": + if isinstance(device, torch.device): + self.device = device + elif isinstance(device, str): + self.device = torch.device(device) + elif device < 0: + self.device = torch.device("cpu") + else: + self.device = torch.device("cuda:{device}") else: - self.device = device if framework == "tf" else torch.device("cpu" if device < 0 else f"cuda:{device}") + self.device = device self.binary_output = binary_output # Special handling - if self.framework == "pt" and self.device.type == "cuda": + if self.framework == "pt" and self.device.type != "cpu": self.model = self.model.to(self.device) # Update config with task specific parameters From 0544879b10c5db95b5cc7cdab0f35430b0a5d63e Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Wed, 10 Aug 2022 11:20:39 -0700 Subject: [PATCH 103/162] Update philosophy to include other preprocessing classes (#18550) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 📝 update philosophy to include other preprocessing classes * 🖍 apply feedbacks --- docs/source/en/philosophy.mdx | 53 ++++++++++++++++------------------- 1 file changed, 24 insertions(+), 29 deletions(-) diff --git a/docs/source/en/philosophy.mdx b/docs/source/en/philosophy.mdx index 13134c31d4a6b9..1aca1accab9304 100644 --- a/docs/source/en/philosophy.mdx +++ b/docs/source/en/philosophy.mdx @@ -14,29 +14,28 @@ specific language governing permissions and limitations under the License. 🤗 Transformers is an opinionated library built for: -- NLP researchers and educators seeking to use/study/extend large-scale transformers models -- hands-on practitioners who want to fine-tune those models and/or serve them in production -- engineers who just want to download a pretrained model and use it to solve a given NLP task. +- machine learning researchers and educators seeking to use, study or extend large-scale Transformers models. +- hands-on practitioners who want to fine-tune those models or serve them in production, or both. +- engineers who just want to download a pretrained model and use it to solve a given machine learning task. The library was designed with two strong goals in mind: -- Be as easy and fast to use as possible: +1. Be as easy and fast to use as possible: - We strongly limited the number of user-facing abstractions to learn, in fact, there are almost no abstractions, just three standard classes required to use each model: [configuration](main_classes/configuration), - [models](main_classes/model) and [tokenizer](main_classes/tokenizer). + [models](main_classes/model), and a preprocessing class ([tokenizer](main_classes/tokenizer) for NLP, [feature extractor](main_classes/feature_extractor) for vision and audio, and [processor](main_classes/processors) for multimodal inputs). 
- All of these classes can be initialized in a simple and unified way from pretrained instances by using a common - `from_pretrained()` instantiation method which will take care of downloading (if needed), caching and - loading the related class instance and associated data (configurations' hyper-parameters, tokenizers' vocabulary, + `from_pretrained()` method which downloads (if needed), caches and + loads the related class instance and associated data (configurations' hyperparameters, tokenizers' vocabulary, and models' weights) from a pretrained checkpoint provided on [Hugging Face Hub](https://huggingface.co/models) or your own saved checkpoint. - On top of those three base classes, the library provides two APIs: [`pipeline`] for quickly - using a model (plus its associated tokenizer and configuration) on a given task and - [`Trainer`]/`Keras.fit` to quickly train or fine-tune a given model. + using a model for inference on a given task and [`Trainer`] to quickly train or fine-tune a PyTorch model (all TensorFlow models are compatible with `Keras.fit`). - As a consequence, this library is NOT a modular toolbox of building blocks for neural nets. If you want to - extend/build-upon the library, just use regular Python/PyTorch/TensorFlow/Keras modules and inherit from the base - classes of the library to reuse functionalities like model loading/saving. + extend or build upon the library, just use regular Python, PyTorch, TensorFlow, Keras modules and inherit from the base + classes of the library to reuse functionalities like model loading and saving. If you'd like to learn more about our coding philosophy for models, check out our [Repeat Yourself](https://huggingface.co/blog/transformers-design-philosophy) blog post. -- Provide state-of-the-art models with performances as close as possible to the original models: +2. Provide state-of-the-art models with performances as close as possible to the original models: - We provide at least one example for each architecture which reproduces a result provided by the official authors of said architecture. @@ -48,33 +47,29 @@ A few other goals: - Expose the models' internals as consistently as possible: - We give access, using a single API, to the full hidden-states and attention weights. - - Tokenizer and base model's API are standardized to easily switch between models. + - The preprocessing classes and base model APIs are standardized to easily switch between models. -- Incorporate a subjective selection of promising tools for fine-tuning/investigating these models: +- Incorporate a subjective selection of promising tools for fine-tuning and investigating these models: - - A simple/consistent way to add new tokens to the vocabulary and embeddings for fine-tuning. - - Simple ways to mask and prune transformer heads. + - A simple and consistent way to add new tokens to the vocabulary and embeddings for fine-tuning. + - Simple ways to mask and prune Transformer heads. -- Switch easily between PyTorch and TensorFlow 2.0, allowing training using one framework and inference using another. +- Easily switch between PyTorch, TensorFlow 2.0 and Flax, allowing training with one framework and inference with another. 
## Main concepts The library is built around three types of classes for each model: -- **Model classes** such as [`BertModel`], which are 30+ PyTorch models ([torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)) or Keras models ([tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model)) that work with the pretrained weights provided in the - library. -- **Configuration classes** such as [`BertConfig`], which store all the parameters required to build - a model. You don't always need to instantiate these yourself. In particular, if you are using a pretrained model - without any modification, creating the model will automatically take care of instantiating the configuration (which - is part of the model). -- **Tokenizer classes** such as [`BertTokenizer`], which store the vocabulary for each model and - provide methods for encoding/decoding strings in a list of token embeddings indices to be fed to a model. +- **Model classes** can be PyTorch models ([torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)), Keras models ([tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model)) or JAX/Flax models ([flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen.html)) that work with the pretrained weights provided in the library. +- **Configuration classes** store the hyperparameters required to build a model (such as the number of layers and hidden size). You don't always need to instantiate these yourself. In particular, if you are using a pretrained model without any modification, creating the model will automatically take care of instantiating the configuration (which is part of the model). +- **Preprocessing classes** convert the raw data into a format accepted by the model. A [tokenizer](main_classes/tokenizer) stores the vocabulary for each model and provide methods for encoding and decoding strings in a list of token embedding indices to be fed to a model. [Feature extractors](main_classes/feature_extractor) preprocess audio or vision inputs, and a [processor](main_classes/processors) handles multimodal inputs. -All these classes can be instantiated from pretrained instances and saved locally using two methods: +All these classes can be instantiated from pretrained instances, saved locally, and shared on the Hub with three methods: -- `from_pretrained()` lets you instantiate a model/configuration/tokenizer from a pretrained version either +- `from_pretrained()` lets you instantiate a model, configuration, and preprocessing class from a pretrained version either provided by the library itself (the supported models can be found on the [Model Hub](https://huggingface.co/models)) or - stored locally (or on a server) by the user, -- `save_pretrained()` lets you save a model/configuration/tokenizer locally so that it can be reloaded using + stored locally (or on a server) by the user. +- `save_pretrained()` lets you save a model, configuration, and preprocessing class locally so that it can be reloaded using `from_pretrained()`. +- `push_to_hub()` lets you share a model, configuration, and a preprocessing class to the Hub, so it is easily accessible to everyone. 
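The three sharing methods listed above map to a short round trip in code. A minimal sketch (the checkpoint name and the Hub repo id are illustrative):

```python
from transformers import AutoModel

# Download from the Hub, or load from the local cache if already present.
model = AutoModel.from_pretrained("bert-base-cased")

# Save locally; the folder can be handed back to from_pretrained().
model.save_pretrained("./my-checkpoint")
model = AutoModel.from_pretrained("./my-checkpoint")

# Share on the Hub (requires authentication, e.g. via `huggingface-cli login`).
# model.push_to_hub("my-username/my-checkpoint")
```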
From 8b9873312e4a693243e31ccea1cdbefa6b226b26 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Wed, 10 Aug 2022 15:46:03 -0400 Subject: [PATCH 104/162] Properly move cache when it is not in default path (#18563) --- src/transformers/utils/hub.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/transformers/utils/hub.py b/src/transformers/utils/hub.py index 00f9c277c41773..163ad64ffa173b 100644 --- a/src/transformers/utils/hub.py +++ b/src/transformers/utils/hub.py @@ -1070,7 +1070,11 @@ def move_cache(cache_dir=None, new_cache_dir=None, token=None): "`transformers.utils.move_cache()`." ) try: - move_cache() + if TRANSFORMERS_CACHE != default_cache_path: + # Users set some env variable to customize cache storage + move_cache(TRANSFORMERS_CACHE, TRANSFORMERS_CACHE) + else: + move_cache() except Exception as e: trace = "\n".join(traceback.format_tb(e.__traceback__)) logger.error( From c2fc948fb0a2c359537de37dc606673fb4ccfd2e Mon Sep 17 00:00:00 2001 From: Dhruv Karan Date: Thu, 11 Aug 2022 01:17:31 +0530 Subject: [PATCH 105/162] Adds CLIP to models exportable with ONNX (#18515) * onnx config for clip * default opset as 14 * changes from the original repo * input values order fix * outputs fix * remove unused import * ran make fix-copies * black format * review comments: forward ref, import fix, model change revert, .to cleanup * make style * formatting fixes * revert groupvit * comment for cast to int32 * comment fix * make .T as .t() for onnx conversion * ran make fix-copies * remove unneeded comment Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * fix copies * remove comment Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- docs/source/en/serialization.mdx | 1 + src/transformers/models/clip/__init__.py | 16 +++++- .../models/clip/configuration_clip.py | 50 ++++++++++++++++++- src/transformers/models/clip/modeling_clip.py | 9 ++-- .../models/groupvit/modeling_groupvit.py | 7 ++- .../models/owlvit/modeling_owlvit.py | 2 +- .../modeling_vision_text_dual_encoder.py | 2 +- src/transformers/onnx/features.py | 4 ++ tests/onnx/test_onnx_v2.py | 1 + 9 files changed, 82 insertions(+), 10 deletions(-) diff --git a/docs/source/en/serialization.mdx b/docs/source/en/serialization.mdx index 9561bbd8ec77c1..0aacdf76f7ef0f 100644 --- a/docs/source/en/serialization.mdx +++ b/docs/source/en/serialization.mdx @@ -55,6 +55,7 @@ Ready-made configurations include the following architectures: - BlenderbotSmall - BLOOM - CamemBERT +- CLIP - CodeGen - ConvBERT - ConvNeXT diff --git a/src/transformers/models/clip/__init__.py b/src/transformers/models/clip/__init__.py index 6a6e64c995d385..932130f8d5fdf9 100644 --- a/src/transformers/models/clip/__init__.py +++ b/src/transformers/models/clip/__init__.py @@ -29,7 +29,13 @@ _import_structure = { - "configuration_clip": ["CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP", "CLIPConfig", "CLIPTextConfig", "CLIPVisionConfig"], + "configuration_clip": [ + "CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP", + "CLIPConfig", + "CLIPOnnxConfig", + "CLIPTextConfig", + "CLIPVisionConfig", + ], "tokenization_clip": ["CLIPTokenizer"], } @@ -95,7 +101,13 @@ if TYPE_CHECKING: - from .configuration_clip import CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP, CLIPConfig, CLIPTextConfig, CLIPVisionConfig + from .configuration_clip import ( + CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP, + CLIPConfig, + CLIPOnnxConfig, + CLIPTextConfig, + CLIPVisionConfig, + ) from .tokenization_clip import CLIPTokenizer try: 
diff --git a/src/transformers/models/clip/configuration_clip.py b/src/transformers/models/clip/configuration_clip.py index 3bb22b74a4c77c..a118b179e4c09f 100644 --- a/src/transformers/models/clip/configuration_clip.py +++ b/src/transformers/models/clip/configuration_clip.py @@ -16,9 +16,16 @@ import copy import os -from typing import Union +from collections import OrderedDict +from typing import TYPE_CHECKING, Any, Mapping, Optional, Union + + +if TYPE_CHECKING: + from ...processing_utils import ProcessorMixin + from ...utils import TensorType from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig from ...utils import logging @@ -317,3 +324,44 @@ def to_dict(self): output["vision_config"] = self.vision_config.to_dict() output["model_type"] = self.__class__.model_type return output + + +class CLIPOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("input_ids", {0: "batch", 1: "sequence"}), + ("pixel_values", {0: "batch"}), + ("attention_mask", {0: "batch", 1: "sequence"}), + ] + ) + + @property + def outputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("logits_per_image", {0: "batch"}), + ("logits_per_text", {0: "batch"}), + ("text_embeds", {0: "batch"}), + ("image_embeds", {0: "batch"}), + ] + ) + + @property + def atol_for_validation(self) -> float: + return 1e-4 + + def generate_dummy_inputs( + self, + processor: "ProcessorMixin", + framework: Optional["TensorType"] = None, + ) -> Mapping[str, Any]: + + text_input_dict = super().generate_dummy_inputs(processor.tokenizer, framework=framework) + image_input_dict = super().generate_dummy_inputs(processor.feature_extractor, framework=framework) + return {**text_input_dict, **image_input_dict} + + @property + def default_onnx_opset(self) -> int: + return 14 diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index ddc2236371c29a..799d0ef0462afc 100755 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -68,7 +68,7 @@ def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: def clip_loss(similarity: torch.Tensor) -> torch.Tensor: caption_loss = contrastive_loss(similarity) - image_loss = contrastive_loss(similarity.T) + image_loss = contrastive_loss(similarity.t()) return (caption_loss + image_loss) / 2.0 @@ -660,7 +660,10 @@ def forward( # text_embeds.shape = [batch_size, sequence_length, transformer.width] # take features from the eot embedding (eot_token is the highest number in each sequence) - pooled_output = last_hidden_state[torch.arange(last_hidden_state.shape[0]), input_ids.argmax(dim=-1)] + # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14 + pooled_output = last_hidden_state[ + torch.arange(last_hidden_state.shape[0]), input_ids.to(torch.int).argmax(dim=-1) + ] if not return_dict: return (last_hidden_state, pooled_output) + encoder_outputs[1:] @@ -1050,7 +1053,7 @@ def forward( # cosine similarity as logits logit_scale = self.logit_scale.exp() logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale - logits_per_image = logits_per_text.T + logits_per_image = logits_per_text.t() loss = None if return_loss: diff --git a/src/transformers/models/groupvit/modeling_groupvit.py b/src/transformers/models/groupvit/modeling_groupvit.py index 1073d4bfea8708..9817065ab37a55 100644 --- a/src/transformers/models/groupvit/modeling_groupvit.py +++ 
b/src/transformers/models/groupvit/modeling_groupvit.py @@ -72,7 +72,7 @@ def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: # Copied from transformers.models.clip.modeling_clip.clip_loss with clip->groupvit def groupvit_loss(similarity: torch.Tensor) -> torch.Tensor: caption_loss = contrastive_loss(similarity) - image_loss = contrastive_loss(similarity.T) + image_loss = contrastive_loss(similarity.t()) return (caption_loss + image_loss) / 2.0 @@ -1132,7 +1132,10 @@ def forward( # text_embeds.shape = [batch_size, sequence_length, transformer.width] # take features from the eot embedding (eot_token is the highest number in each sequence) - pooled_output = last_hidden_state[torch.arange(last_hidden_state.shape[0]), input_ids.argmax(dim=-1)] + # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14 + pooled_output = last_hidden_state[ + torch.arange(last_hidden_state.shape[0]), input_ids.to(torch.int).argmax(dim=-1) + ] if not return_dict: return (last_hidden_state, pooled_output) + encoder_outputs[1:] diff --git a/src/transformers/models/owlvit/modeling_owlvit.py b/src/transformers/models/owlvit/modeling_owlvit.py index 35ebd16cf25bd8..73ee2597f1b163 100644 --- a/src/transformers/models/owlvit/modeling_owlvit.py +++ b/src/transformers/models/owlvit/modeling_owlvit.py @@ -71,7 +71,7 @@ def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: # Copied from transformers.models.clip.modeling_clip.clip_loss with clip->owlvit def owlvit_loss(similarity: torch.Tensor) -> torch.Tensor: caption_loss = contrastive_loss(similarity) - image_loss = contrastive_loss(similarity.T) + image_loss = contrastive_loss(similarity.t()) return (caption_loss + image_loss) / 2.0 diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py index 66340deaf4927f..64fd2f405d5084 100755 --- a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +++ b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py @@ -154,7 +154,7 @@ def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: # Copied from transformers.models.clip.modeling_clip.clip_loss def clip_loss(similarity: torch.Tensor) -> torch.Tensor: caption_loss = contrastive_loss(similarity) - image_loss = contrastive_loss(similarity.T) + image_loss = contrastive_loss(similarity.t()) return (caption_loss + image_loss) / 2.0 diff --git a/src/transformers/onnx/features.py b/src/transformers/onnx/features.py index 8d8b8190e46819..fbfeb47250e73f 100644 --- a/src/transformers/onnx/features.py +++ b/src/transformers/onnx/features.py @@ -201,6 +201,10 @@ class FeaturesManager: "question-answering", onnx_config_cls="models.camembert.CamembertOnnxConfig", ), + "clip": supported_features_mapping( + "default", + onnx_config_cls="models.clip.CLIPOnnxConfig", + ), "codegen": supported_features_mapping( "default", "causal-lm", diff --git a/tests/onnx/test_onnx_v2.py b/tests/onnx/test_onnx_v2.py index 98ab0fad131e01..5634abc7706856 100644 --- a/tests/onnx/test_onnx_v2.py +++ b/tests/onnx/test_onnx_v2.py @@ -185,6 +185,7 @@ def test_values_override(self): ("big-bird", "google/bigbird-roberta-base"), ("ibert", "kssteven/ibert-roberta-base"), ("camembert", "camembert-base"), + ("clip", "openai/clip-vit-base-patch32"), ("convbert", "YituTech/conv-bert-base"), ("codegen", "Salesforce/codegen-350M-multi"), ("deberta", "microsoft/deberta-base"), From 
fe29e4c046c886c24312847795fb29669a8bd191 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Wed, 10 Aug 2022 22:41:58 +0200 Subject: [PATCH 106/162] raise atol for MT5OnnxConfig (#18560) Co-authored-by: ydshieh --- src/transformers/models/mt5/configuration_mt5.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/mt5/configuration_mt5.py b/src/transformers/models/mt5/configuration_mt5.py index 3e72831ad25fbc..d9232c94629db2 100644 --- a/src/transformers/models/mt5/configuration_mt5.py +++ b/src/transformers/models/mt5/configuration_mt5.py @@ -147,9 +147,9 @@ def num_hidden_layers(self): return self.num_layers -# Copied from transformers.models.t5.configuration_t5.T5OnnxConfig class MT5OnnxConfig(OnnxSeq2SeqConfigWithPast): @property + # Copied from transformers.models.t5.configuration_t5.T5OnnxConfig.inputs def inputs(self) -> Mapping[str, Mapping[int, str]]: common_inputs = { "input_ids": {0: "batch", 1: "encoder_sequence"}, @@ -169,5 +169,10 @@ def inputs(self) -> Mapping[str, Mapping[int, str]]: return common_inputs @property + # Copied from transformers.models.t5.configuration_t5.T5OnnxConfig.default_onnx_opset def default_onnx_opset(self) -> int: return 13 + + @property + def atol_for_validation(self) -> float: + return 5e-4 From 793d978d544647874b4e213f780e5912449e9279 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Wed, 10 Aug 2022 15:28:19 -0700 Subject: [PATCH 107/162] fix string (#18568) --- src/transformers/pipelines/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py index a0ce06ec5e33f1..7842b95b32859c 100644 --- a/src/transformers/pipelines/base.py +++ b/src/transformers/pipelines/base.py @@ -768,7 +768,7 @@ def __init__( elif device < 0: self.device = torch.device("cpu") else: - self.device = torch.device("cuda:{device}") + self.device = torch.device(f"cuda:{device}") else: self.device = device self.binary_output = binary_output From 8aea3311fe77d5641a70bfb643b60f98ca2f745d Mon Sep 17 00:00:00 2001 From: Maxime G Date: Thu, 11 Aug 2022 10:59:37 +0200 Subject: [PATCH 108/162] Segformer TF: fix output size in documentation (#18572) * Segformer TF: fix output size in doc * Segformer pytorch: fix output size in doc Co-authored-by: Maxime Gardoni --- src/transformers/models/segformer/modeling_segformer.py | 2 +- src/transformers/models/segformer/modeling_tf_segformer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/segformer/modeling_segformer.py b/src/transformers/models/segformer/modeling_segformer.py index b8be4cdb70a6d0..4e70eb957acf3f 100755 --- a/src/transformers/models/segformer/modeling_segformer.py +++ b/src/transformers/models/segformer/modeling_segformer.py @@ -784,7 +784,7 @@ def forward( >>> inputs = feature_extractor(images=image, return_tensors="pt") >>> outputs = model(**inputs) - >>> logits = outputs.logits # shape (batch_size, num_labels, height, width) + >>> logits = outputs.logits # shape (batch_size, num_labels, height/4, width/4) >>> list(logits.shape) [1, 150, 128, 128] ```""" diff --git a/src/transformers/models/segformer/modeling_tf_segformer.py b/src/transformers/models/segformer/modeling_tf_segformer.py index c2f4b2ff0c7cd8..2ff256d78d2b1d 100644 --- a/src/transformers/models/segformer/modeling_tf_segformer.py +++ b/src/transformers/models/segformer/modeling_tf_segformer.py @@ -847,7 +847,7 @@ def call( >>> inputs = 
feature_extractor(images=image, return_tensors="tf") >>> outputs = model(**inputs, training=False) - >>> # logits are of shape (batch_size, num_labels, height, width) + >>> # logits are of shape (batch_size, num_labels, height/4, width/4) >>> logits = outputs.logits >>> list(logits.shape) [1, 150, 128, 128] From db07c44e83f01bcdb7f790c8be1f535ed1ebaa6b Mon Sep 17 00:00:00 2001 From: Alara Dirik <8944735+alaradirik@users.noreply.github.com> Date: Thu, 11 Aug 2022 15:44:23 +0300 Subject: [PATCH 109/162] Fix resizing bug in OWL-ViT (#18573) * Fixes resizing bug in OWL-ViT * Defaults to square resize if size is set to an int * Sets do_center_crop default value to False --- .../models/owlvit/feature_extraction_owlvit.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/owlvit/feature_extraction_owlvit.py b/src/transformers/models/owlvit/feature_extraction_owlvit.py index 1e4bc735608a35..f8a45706835d8f 100644 --- a/src/transformers/models/owlvit/feature_extraction_owlvit.py +++ b/src/transformers/models/owlvit/feature_extraction_owlvit.py @@ -50,13 +50,15 @@ class OwlViTFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin Args: do_resize (`bool`, *optional*, defaults to `True`): Whether to resize the shorter edge of the input to a certain `size`. - size (`int`, *optional*, defaults to 768): - Resize the shorter edge of the input to the given size. Only has an effect if `do_resize` is set to `True`. + size (`int` or `Tuple[int, int]`, *optional*, defaults to (768, 768)): + The size to use for resizing the image. Only has an effect if `do_resize` is set to `True`. If `size` is a + sequence like (h, w), output size will be matched to this. If `size` is an int, then image will be resized + to (size, size). resample (`int`, *optional*, defaults to `PIL.Image.BICUBIC`): An optional resampling filter. This can be one of `PIL.Image.NEAREST`, `PIL.Image.BOX`, `PIL.Image.BILINEAR`, `PIL.Image.HAMMING`, `PIL.Image.BICUBIC` or `PIL.Image.LANCZOS`. Only has an effect if `do_resize` is set to `True`. - do_center_crop (`bool`, *optional*, defaults to `True`): + do_center_crop (`bool`, *optional*, defaults to `False`): Whether to crop the input at the center. If the input size is smaller than `crop_size` along any edge, the image is padded with 0's and then center cropped. 
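+            With `size` defaulting to the square `(768, 768)`, no center crop is needed to reach the model
+            input resolution, which is why this now defaults to `False`.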
crop_size (`int`, *optional*, defaults to 768): @@ -74,10 +76,10 @@ class OwlViTFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin def __init__( self, do_resize=True, - size=768, + size=(768, 768), resample=Image.BICUBIC, crop_size=768, - do_center_crop=True, + do_center_crop=False, do_normalize=True, image_mean=None, image_std=None, @@ -195,7 +197,7 @@ def __call__( # transformations (resizing + center cropping + normalization) if self.do_resize and self.size is not None and self.resample is not None: images = [ - self.resize(image=image, size=self.size, resample=self.resample, default_to_square=False) + self.resize(image=image, size=self.size, resample=self.resample, default_to_square=True) for image in images ] if self.do_center_crop and self.crop_size is not None: From 5a29d4f3285f896576cecb03ea2dc540e7ea013d Mon Sep 17 00:00:00 2001 From: "Wonseok Lee (Jack)" Date: Thu, 11 Aug 2022 21:51:39 +0900 Subject: [PATCH 110/162] Fix LayoutLMv3 documentation (#17932) * fix typos * fix sequence_length docs of LayoutLMv3Model * delete trailing white spaces * fix layoutlmv3 docs more * apply make fixup & quality * change to two versions of input docstring * apply make fixup & quality --- .../models/layoutlmv3/modeling_layoutlmv3.py | 112 ++++++++++++++++-- 1 file changed, 99 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py b/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py index f3bdd2cd8d9067..68987e38e9942e 100644 --- a/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py @@ -54,17 +54,93 @@ behavior. Parameters: - config ([`LayoutLMv2Config`]): Model configuration class with all the parameters of the model. + config ([`LayoutLMv3Config`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ -LAYOUTLMV3_INPUTS_DOCSTRING = r""" +LAYOUTLMV3_MODEL_INPUTS_DOCSTRING = r""" Args: - input_ids (`torch.LongTensor` of shape `{0}`): + input_ids (`torch.LongTensor` of shape `({0})`): Indices of input sequence tokens in the vocabulary. - Indices can be obtained using [`LayoutLMv2Tokenizer`]. See [`PreTrainedTokenizer.encode`] and + Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS] + token. See `pixel_values` for `patch_sequence_length`. + + Indices can be obtained using [`LayoutLMv3Tokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + + bbox (`torch.LongTensor` of shape `({0}, 4)`, *optional*): + Bounding boxes of each input sequence tokens. Selected in the range `[0, + config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1) + format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1, + y1) represents the position of the lower right corner. + + Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS] + token. See `pixel_values` for `patch_sequence_length`. + + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Batch of document images. 
Each image is divided into patches of shape `(num_channels, config.patch_size, + config.patch_size)` and the total number of patches (=`patch_sequence_length`) equals to `((height / + config.patch_size) * (width / config.patch_size))`. + + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS] + token. See `pixel_values` for `patch_sequence_length`. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS] + token. See `pixel_values` for `patch_sequence_length`. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS] + token. See `pixel_values` for `patch_sequence_length`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert *input_ids* indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + +LAYOUTLMV3_DOWNSTREAM_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`LayoutLMv3Tokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids) @@ -76,16 +152,18 @@ y1) represents the position of the lower right corner. pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Batch of document images. + Batch of document images. Each image is divided into patches of shape `(num_channels, config.patch_size, + config.patch_size)` and the total number of patches (=`patch_sequence_length`) equals to `((height / + config.patch_size) * (width / config.patch_size))`. 
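+            For example, with `height = width = 224` and `config.patch_size = 16`, an image contributes
+            `(224 / 16) * (224 / 16) = 196` patches.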
- attention_mask (`torch.FloatTensor` of shape `{0}`, *optional*): + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. [What are attention masks?](../glossary#attention-mask) - token_type_ids (`torch.LongTensor` of shape `{0}`, *optional*): + token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`: @@ -93,7 +171,7 @@ - 1 corresponds to a *sentence B* token. [What are token type IDs?](../glossary#token-type-ids) - position_ids (`torch.LongTensor` of shape `{0}`, *optional*): + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`. @@ -104,7 +182,7 @@ - 1 indicates the head is **not masked**, - 0 indicates the head is **masked**. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert *input_ids* indices into associated vectors than the model's internal embedding lookup matrix. @@ -763,7 +841,9 @@ def forward_image(self, pixel_values): return embeddings - @add_start_docstrings_to_model_forward(LAYOUTLMV3_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) + @add_start_docstrings_to_model_forward( + LAYOUTLMV3_MODEL_INPUTS_DOCSTRING.format("batch_size, token_sequence_length") + ) @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, @@ -975,7 +1055,9 @@ def __init__(self, config): self.init_weights() - @add_start_docstrings_to_model_forward(LAYOUTLMV3_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_start_docstrings_to_model_forward( + LAYOUTLMV3_DOWNSTREAM_INPUTS_DOCSTRING.format("batch_size, sequence_length") + ) @replace_return_docstrings(output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC) def forward( self, @@ -1084,7 +1166,9 @@ def __init__(self, config): self.init_weights() - @add_start_docstrings_to_model_forward(LAYOUTLMV3_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_start_docstrings_to_model_forward( + LAYOUTLMV3_DOWNSTREAM_INPUTS_DOCSTRING.format("batch_size, sequence_length") + ) @replace_return_docstrings(output_type=QuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, @@ -1214,7 +1298,9 @@ def __init__(self, config): self.init_weights() - @add_start_docstrings_to_model_forward(LAYOUTLMV3_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_start_docstrings_to_model_forward( + LAYOUTLMV3_DOWNSTREAM_INPUTS_DOCSTRING.format("batch_size, sequence_length") + ) @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) def forward( self, From ad4215f95069002723fdc85b1a464d754b58b363 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger Date: Thu, 11 Aug 2022 09:33:41 -0400 Subject: [PATCH 111/162] Skip broken tests --- tests/models/owlvit/test_modeling_owlvit.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/models/owlvit/test_modeling_owlvit.py 
b/tests/models/owlvit/test_modeling_owlvit.py index edddc53beeab88..7564d192ad9898 100644 --- a/tests/models/owlvit/test_modeling_owlvit.py +++ b/tests/models/owlvit/test_modeling_owlvit.py @@ -733,8 +733,9 @@ def prepare_img(): @require_vision @require_torch +@unittest.skip("These tests are broken, fix me Alara") class OwlViTModelIntegrationTest(unittest.TestCase): - # @slow + @slow def test_inference(self): model_name = "google/owlvit-base-patch32" model = OwlViTModel.from_pretrained(model_name).to(torch_device) From a272ed0d8894bd9a3921f9cc36ea2c5e8f4b28a7 Mon Sep 17 00:00:00 2001 From: Dan Jones Date: Thu, 11 Aug 2022 14:45:04 +0100 Subject: [PATCH 112/162] Change BartLearnedPositionalEmbedding's forward method signature to support Opacus training (#18486) * changing BartLearnedPositionalEmbedding forward signature and references to it * removing debugging dead code (thanks style checker) * blackened modeling_bart file * removing copy inconsistencies via make fix-copies * changing references to copied signatures in Bart variants * make fix-copies once more * using expand over repeat (thanks @michaelbenayoun) * expand instead of repeat for all model copies Co-authored-by: Daniel Jones --- src/transformers/models/bart/modeling_bart.py | 26 +++++++++++-------- .../models/mbart/modeling_mbart.py | 23 +++++++++------- src/transformers/models/mvp/modeling_mvp.py | 22 ++++++++++------ .../models/plbart/modeling_plbart.py | 26 +++++++++++-------- .../models/trocr/modeling_trocr.py | 19 +++++++++----- 5 files changed, 70 insertions(+), 46 deletions(-) diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py index 8411cc6cefefed..525da6f34b06cf 100755 --- a/src/transformers/models/bart/modeling_bart.py +++ b/src/transformers/models/bart/modeling_bart.py @@ -128,12 +128,14 @@ def __init__(self, num_embeddings: int, embedding_dim: int): self.offset = 2 super().__init__(num_embeddings + self.offset, embedding_dim) - def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0): - """`input_ids_shape` is expected to be [bsz x seqlen].""" - bsz, seq_len = input_ids_shape[:2] + def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0): + """`input_ids' shape is expected to be [bsz x seqlen].""" + + bsz, seq_len = input_ids.shape[:2] positions = torch.arange( past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device - ) + ).expand(bsz, -1) + return super().forward(positions + self.offset) @@ -788,17 +790,17 @@ def forward( if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) + input = input_ids + input_ids = input_ids.view(-1, input_ids.shape[-1]) elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] + input = inputs_embeds[:, :, -1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale - embed_pos = self.embed_positions(input_shape) + embed_pos = self.embed_positions(input) hidden_states = inputs_embeds + embed_pos hidden_states = self.layernorm_embedding(hidden_states) @@ -1015,10 +1017,12 @@ def forward( if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both decoder_input_ids and 
decoder_inputs_embeds at the same time") elif input_ids is not None: - input_shape = input_ids.size() + input = input_ids + input_shape = input.shape input_ids = input_ids.view(-1, input_shape[-1]) elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] + input = inputs_embeds[:, :, -1] else: raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") @@ -1026,7 +1030,7 @@ def forward( past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + inputs_embeds = self.embed_tokens(input) * self.embed_scale attention_mask = self._prepare_decoder_attention_mask( attention_mask, input_shape, inputs_embeds, past_key_values_length @@ -1038,7 +1042,7 @@ def forward( encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) # embed positions - positions = self.embed_positions(input_shape, past_key_values_length) + positions = self.embed_positions(input, past_key_values_length) hidden_states = inputs_embeds + positions hidden_states = self.layernorm_embedding(hidden_states) diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py index 16ea95bc0aedde..66011fe6a73d0a 100755 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -134,12 +134,14 @@ def __init__(self, num_embeddings: int, embedding_dim: int): self.offset = 2 super().__init__(num_embeddings + self.offset, embedding_dim) - def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0): - """`input_ids_shape` is expected to be [bsz x seqlen].""" - bsz, seq_len = input_ids_shape[:2] + def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0): + """`input_ids' shape is expected to be [bsz x seqlen].""" + + bsz, seq_len = input_ids.shape[:2] positions = torch.arange( past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device - ) + ).expand(bsz, -1) + return super().forward(positions + self.offset) @@ -783,17 +785,18 @@ def forward( if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: - input_shape = input_ids.size() + input = input_ids + input_shape = input.shape input_ids = input_ids.view(-1, input_shape[-1]) elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] + input = inputs_embeds[:, :, -1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale - embed_pos = self.embed_positions(input_shape) + embed_pos = self.embed_positions(input) hidden_states = inputs_embeds + embed_pos hidden_states = self.layernorm_embedding(hidden_states) @@ -1013,10 +1016,12 @@ def forward( if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") elif input_ids is not None: - input_shape = input_ids.size() + input = input_ids + input_shape = input.size() input_ids = input_ids.view(-1, input_shape[-1]) elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] + input = inputs_embeds[:, :, -1] else: raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") @@ 
-1036,7 +1041,7 @@ def forward( encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) # embed positions - positions = self.embed_positions(input_shape, past_key_values_length) + positions = self.embed_positions(input, past_key_values_length) hidden_states = inputs_embeds + positions hidden_states = self.layernorm_embedding(hidden_states) diff --git a/src/transformers/models/mvp/modeling_mvp.py b/src/transformers/models/mvp/modeling_mvp.py index d3d239c4cff125..37c1a7d837f7ba 100644 --- a/src/transformers/models/mvp/modeling_mvp.py +++ b/src/transformers/models/mvp/modeling_mvp.py @@ -134,12 +134,14 @@ def __init__(self, num_embeddings: int, embedding_dim: int): self.offset = 2 super().__init__(num_embeddings + self.offset, embedding_dim) - def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0): - """`input_ids_shape` is expected to be [bsz x seqlen].""" - bsz, seq_len = input_ids_shape[:2] + def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0): + """`input_ids' shape is expected to be [bsz x seqlen].""" + + bsz, seq_len = input_ids.shape[:2] positions = torch.arange( past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device - ) + ).expand(bsz, -1) + return super().forward(positions + self.offset) @@ -895,17 +897,19 @@ def forward( if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: - input_shape = input_ids.size() + input = input_ids + input_shape = input.shape input_ids = input_ids.view(-1, input_shape[-1]) elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] + input = inputs_embeds[:, :, -1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale - embed_pos = self.embed_positions(input_shape) + embed_pos = self.embed_positions(input) hidden_states = inputs_embeds + embed_pos hidden_states = self.layernorm_embedding(hidden_states) @@ -1144,10 +1148,12 @@ def forward( if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") elif input_ids is not None: - input_shape = input_ids.size() + input = input_ids + input_shape = input_ids.shape input_ids = input_ids.view(-1, input_shape[-1]) elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] + input = inputs_embeds[:, :, -1] else: raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") @@ -1167,7 +1173,7 @@ def forward( encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) # embed positions - positions = self.embed_positions(input_shape, past_key_values_length) + positions = self.embed_positions(input, past_key_values_length) hidden_states = inputs_embeds + positions hidden_states = self.layernorm_embedding(hidden_states) diff --git a/src/transformers/models/plbart/modeling_plbart.py b/src/transformers/models/plbart/modeling_plbart.py index d03ddf33ebfa7a..d86decb568192e 100755 --- a/src/transformers/models/plbart/modeling_plbart.py +++ b/src/transformers/models/plbart/modeling_plbart.py @@ -131,12 +131,14 @@ def __init__(self, num_embeddings: int, embedding_dim: int): self.offset = 2 super().__init__(num_embeddings + self.offset, embedding_dim) - 
def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0): - """`input_ids_shape` is expected to be [bsz x seqlen].""" - bsz, seq_len = input_ids_shape[:2] + def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0): + """`input_ids' shape is expected to be [bsz x seqlen].""" + + bsz, seq_len = input_ids.shape[:2] positions = torch.arange( past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device - ) + ).expand(bsz, -1) + return super().forward(positions + self.offset) @@ -759,17 +761,17 @@ def forward( if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) + input = input_ids + input_ids = input_ids.view(-1, input_ids.shape[-1]) elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] + input = inputs_embeds[:, :, -1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale - embed_pos = self.embed_positions(input_shape) + embed_pos = self.embed_positions(input) hidden_states = inputs_embeds + embed_pos hidden_states = self.layernorm_embedding(hidden_states) @@ -987,10 +989,12 @@ def forward( if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") elif input_ids is not None: - input_shape = input_ids.size() + input = input_ids + input_shape = input.shape input_ids = input_ids.view(-1, input_shape[-1]) elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] + input = inputs_embeds[:, :, -1] else: raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") @@ -998,7 +1002,7 @@ def forward( past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + inputs_embeds = self.embed_tokens(input) * self.embed_scale attention_mask = self._prepare_decoder_attention_mask( attention_mask, input_shape, inputs_embeds, past_key_values_length @@ -1010,7 +1014,7 @@ def forward( encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) # embed positions - positions = self.embed_positions(input_shape, past_key_values_length) + positions = self.embed_positions(input, past_key_values_length) hidden_states = inputs_embeds + positions hidden_states = self.layernorm_embedding(hidden_states) diff --git a/src/transformers/models/trocr/modeling_trocr.py b/src/transformers/models/trocr/modeling_trocr.py index a79e5e901d67c4..e25f73c8b7d3b5 100644 --- a/src/transformers/models/trocr/modeling_trocr.py +++ b/src/transformers/models/trocr/modeling_trocr.py @@ -87,12 +87,14 @@ def __init__(self, num_embeddings: int, embedding_dim: int): self.offset = 2 super().__init__(num_embeddings + self.offset, embedding_dim) - def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0): - """`input_ids_shape` is expected to be [bsz x seqlen].""" - bsz, seq_len = input_ids_shape[:2] + def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0): + """`input_ids' shape is expected to be [bsz x seqlen].""" + + bsz, seq_len = input_ids.shape[:2] positions = torch.arange( 
past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device - ) + ).expand(bsz, -1) + return super().forward(positions + self.offset) @@ -626,10 +628,11 @@ def forward( if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") elif input_ids is not None: - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) + input = input_ids + input_ids = input_ids.view(-1, input.shape[-1]) elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] + input = inputs_embeds[:, :, -1] else: raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") @@ -640,7 +643,7 @@ def forward( inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale if self.config.use_learned_position_embeddings: - embed_pos = self.embed_positions(input_shape, past_key_values_length=past_key_values_length) + embed_pos = self.embed_positions(input, past_key_values_length=past_key_values_length) else: embed_pos = self.embed_positions(input_ids, past_key_values_length=past_key_values_length) @@ -651,6 +654,8 @@ def forward( hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + input_shape = input.shape + attention_mask = self._prepare_decoder_attention_mask( attention_mask, input_shape, inputs_embeds, past_key_values_length )
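This patch is subtle enough to deserve a standalone illustration before moving on. Opacus wraps each module and records per-sample activations from the tensors passed to `forward`; the old signature received a bare `torch.Size`, so there was no batch-carrying tensor for it to hook. The new signature takes the real `input_ids` tensor and expands the single position row to `(bsz, seq_len)`. Below is a condensed, runnable sketch of the patched Bart module; the `forward` body is lifted from the diff above, while the class name, docstring, and the demo lines at the bottom are illustrative additions:

```python
import torch
from torch import nn


class LearnedPositionalEmbedding(nn.Embedding):
    """Condensed sketch of the patched BartLearnedPositionalEmbedding."""

    def __init__(self, num_embeddings: int, embedding_dim: int):
        # Bart reserves two extra embedding slots; position indices are shifted by this offset.
        self.offset = 2
        super().__init__(num_embeddings + self.offset, embedding_dim)

    def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
        """`input_ids` is expected to be [bsz x seqlen]; only its shape and device are used."""
        bsz, seq_len = input_ids.shape[:2]
        positions = torch.arange(
            past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
        ).expand(bsz, -1)  # broadcast view of shape (bsz, seq_len): one position row per sample
        return super().forward(positions + self.offset)


emb = LearnedPositionalEmbedding(1024, 16)
dummy_ids = torch.zeros(4, 7, dtype=torch.long)  # batch of 4, sequence length 7
print(emb(dummy_ids).shape)  # torch.Size([4, 7, 16])
```

Note `.expand(bsz, -1)` rather than `.repeat(bsz, 1)`, per the review note in the commit message: `expand` returns a broadcast view of the single position row instead of materializing `bsz` copies.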
From 6d8ab2774d2de8a3b8339d17b6ee1b940caddf0b Mon Sep 17 00:00:00 2001 From: flozi00 Date: Thu, 11 Aug 2022 15:52:27 +0200 Subject: [PATCH 113/162] german docs translation (#18544) * Create _config.py * Create _toctree.yml * Create index.mdx not sure about "du / ihr" oder "sie" * Create quicktour.mdx * Update _toctree.yml * Update build_documentation.yml * Update build_pr_documentation.yml * fix build * Update index.mdx * Update quicktour.mdx * Create installation.mdx * Update _toctree.yml --- .github/workflows/build_documentation.yml | 2 +- .github/workflows/build_pr_documentation.yml | 2 +- docs/source/de/_config.py | 14 + docs/source/de/_toctree.yml | 8 + docs/source/de/index.mdx | 322 ++++++++++++++ docs/source/de/installation.mdx | 246 +++++++++++ docs/source/de/quicktour.mdx | 428 +++++++++++++++++++ 7 files changed, 1020 insertions(+), 2 deletions(-) create mode 100644 docs/source/de/_config.py create mode 100644 docs/source/de/_toctree.yml create mode 100644 docs/source/de/index.mdx create mode 100644 docs/source/de/installation.mdx create mode 100644 docs/source/de/quicktour.mdx diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index fb28fe4f2bdf92..d78fd53a041596 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -15,6 +15,6 @@ jobs: commit_sha: ${{ github.sha }} package: transformers notebook_folder: transformers_doc - languages: en es it pt + languages: de en es it pt secrets: token: ${{ secrets.HUGGINGFACE_PUSH }} diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index 8a4dc5a06ec87c..efe9965c3be24a 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -14,4 +14,4 @@ jobs: commit_sha: ${{ github.event.pull_request.head.sha }} pr_number: ${{ github.event.number }} package: transformers - languages: en es it pt + languages: de en es it pt diff --git a/docs/source/de/_config.py b/docs/source/de/_config.py new file mode 100644 index 00000000000000..a6d75853f57219 --- /dev/null +++ b/docs/source/de/_config.py @@ -0,0 +1,14 @@ +# docstyle-ignore +INSTALL_CONTENT = """ +# Transformers installation +! pip install transformers datasets +# To install from source instead of the last release, comment the command above and uncomment the following one. +# ! pip install git+https://github.com/huggingface/transformers.git +""" + +notebook_first_cells = [{"type": "code", "content": INSTALL_CONTENT}] +black_avoid_patterns = { + "{processor_class}": "FakeProcessorClass", + "{model_class}": "FakeModelClass", + "{object_class}": "FakeObjectClass", +} diff --git a/docs/source/de/_toctree.yml b/docs/source/de/_toctree.yml new file mode 100644 index 00000000000000..6097df8d06ae0b --- /dev/null +++ b/docs/source/de/_toctree.yml @@ -0,0 +1,8 @@ +- sections: + - local: index + title: 🤗 Transformers + - local: quicktour + title: Schnellstart + - local: installation + title: Installation + title: Erste Schritte diff --git a/docs/source/de/index.mdx b/docs/source/de/index.mdx new file mode 100644 index 00000000000000..815fd1724f4412 --- /dev/null +++ b/docs/source/de/index.mdx @@ -0,0 +1,322 @@ + +# 🤗 Transformers + +Maschinelles Lernen auf dem neuesten Stand der Technik für PyTorch, TensorFlow und JAX. + +🤗 Transformers bietet APIs zum einfachen Herunterladen und Trainieren von vortrainierten Modellen auf dem neuesten Stand der Technik. Die Verwendung von vortrainierten Modellen kann Rechenkosten senken, den CO2-Fußabdruck verringern und die Zeit sparen, die für das Training eines Modells von Grund auf benötigt wird. Die Modelle können für verschiedene Modalitäten verwendet werden, wie z. B.: + +* 📝 Text: Textklassifizierung, Informationsextraktion, Beantwortung von Fragen, Zusammenfassung, Übersetzung und Texterstellung in über 100 Sprachen. +* 🖼️ Bilder: Bildklassifizierung, Objekterkennung und Segmentierung. +* 🗣️ Audio: Spracherkennung und Audioklassifizierung. +* 🐙 Multimodal: Beantwortung von Tabellenfragen, optische Zeichenerkennung, Informationsextraktion aus gescannten Dokumenten, Videoklassifizierung und Beantwortung visueller Fragen. + +Unsere Bibliothek unterstützt die nahtlose Integration von drei der beliebtesten Deep-Learning-Bibliotheken: [PyTorch](https://pytorch.org/), [TensorFlow](https://www.tensorflow.org/) und [JAX](https://jax.readthedocs.io/en/latest/). Trainieren Sie Ihr Modell in drei Codezeilen in einem Framework und laden Sie es zur Inferenz mit einem anderen; eine kurze Skizze dazu folgt am Ende dieser Einleitung. + +Jede 🤗 Transformers-Architektur ist in einem eigenständigen Python-Modul definiert, so dass sie leicht für Forschung und Experimente angepasst werden kann. + +## Wenn Sie auf der Suche nach individueller Unterstützung durch das Hugging Face-Team sind + + + HuggingFace Expert Acceleration Program
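Die oben angekündigte kurze Skizze: ein und derselbe vortrainierte Checkpoint, einmal in PyTorch und einmal in TensorFlow geladen. Dies ist nur eine minimale Skizze; `bert-base-uncased` dient lediglich als Beispiel, und `from_pt=True` konvertiert die PyTorch-Gewichte beim Laden.

```python
from transformers import AutoModel, TFAutoModel

# Denselben Checkpoint in zwei Frameworks laden: erst nativ in PyTorch,
# dann in TensorFlow unter Konvertierung der PyTorch-Gewichte.
pt_model = AutoModel.from_pretrained("bert-base-uncased")
tf_model = TFAutoModel.from_pretrained("bert-base-uncased", from_pt=True)
```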
+ +## Inhalt + +Die Dokumentation ist in fünf Teile gegliedert: + +- **GET STARTED** enthält eine kurze Tour und Installationsanweisungen, um mit 🤗 Transformers loszulegen. +- **TUTORIALS** sind ein hervorragender Ausgangspunkt, wenn Sie neu in unserer Bibliothek sind. Dieser Abschnitt hilft Ihnen, die grundlegenden Fähigkeiten zu erlangen, die Sie benötigen, um mit 🤗 Transformers zu arbeiten. +- **HOW-TO GUIDES** zeigen Ihnen, wie Sie ein bestimmtes Ziel erreichen können, z. B. die Feinabstimmung eines vortrainierten Modells für die Sprachmodellierung oder die Erstellung eines benutzerdefinierten Modellkopfs. +- **KONZEPTUELLE ANLEITUNGEN** bieten weitere Diskussionen und Erklärungen zu den zugrunde liegenden Konzepten und Ideen hinter Modellen, Aufgaben und der Designphilosophie von 🤗 Transformers. +- **API** beschreibt jede Klasse und Funktion, gruppiert in: + + - **MAIN CLASSES** für die Hauptklassen, die die wichtigsten APIs der Bibliothek darstellen. + - **MODELLE** für die Klassen und Funktionen, die zu jedem in der Bibliothek implementierten Modell gehören. + - **INTERNAL HELPERS** für die Klassen und Funktionen, die wir intern verwenden. + +Die Bibliothek enthält derzeit JAX-, PyTorch- und TensorFlow-Implementierungen, vortrainierte Modellgewichte, Nutzungsskripte und Konvertierungsprogramme für die folgenden Modelle. + +### Unterstützte Modelle + + + +1. **[ALBERT](model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. +1. **[BART](model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. +1. **[BARThez](model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. +1. **[BARTpho](model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen. +1. **[BEiT](model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei. +1. **[BERT](model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. +1. **[BERT For Sequence Generation](model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. +1. **[BERTweet](model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. +1.
**[BigBird-Pegasus](model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. +1. **[BigBird-RoBERTa](model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. +1. **[Blenderbot](model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. +1. **[BlenderbotSmall](model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. +1. **[BLOOM](model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). +1. **[BORT](model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry. +1. **[ByT5](model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel. +1. **[CamemBERT](model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. +1. **[CANINE](model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. +1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. +1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. +1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. +1.
**[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. +1. **[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. +1. **[CTRL](model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. +1. **[CvT](model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. +1. **[Data2Vec](model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. +1. **[DeBERTa](model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. +1. **[DeBERTa-v2](model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. +1. **[Decision Transformer](model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. +1. **[DeiT](model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. +1. **[DETR](model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. +1. **[DialoGPT](model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. +1. **[DistilBERT](model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. 
The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT. +1. **[DiT](model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. +1. **[DPR](model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. +1. **[DPT](master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. +1. **[ELECTRA](model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. +1. **[EncoderDecoder](model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. +1. **[FlauBERT](model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. +1. **[FLAVA](model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. +1. **[FNet](model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. +1. **[Funnel Transformer](model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. +1. **[GLPN](model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. +1. **[GPT](model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. +1. 
**[GPT Neo](model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. +1. **[GPT NeoX](model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach +1. **[GPT-2](model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**. +1. **[GPT-J](model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki. +1. **[GroupViT](model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. +1. **[Hubert](model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. +1. **[I-BERT](model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. +1. **[ImageGPT](model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever. +1. **[LayoutLM](model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou. +1. **[LayoutLMv2](model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou. +1. **[LayoutLMv3](model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei. +1. **[LayoutXLM](model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei. +1. 
**[LED](model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. +1. **[LeViT](model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze. +1. **[Longformer](model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. +1. **[LongT5](model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang. +1. **[LUKE](model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto. +1. **[LXMERT](model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal. +1. **[M-CTC-T](model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert. +1. **[M2M100](model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin. +1. **[MarianMT](model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team. +1. **[MaskFormer](model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov. +1. **[mBART](model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. +1. **[mBART-50](model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. +1. 
**[Megatron-BERT](model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. +1. **[Megatron-GPT2](model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. +1. **[mLUKE](model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka. +1. **[MobileBERT](model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou. +1. **[MobileViT](model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari. +1. **[MPNet](model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. +1. **[MT5](model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. +1. **[MVP](model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. +1. **[Nezha](model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu. +1. **[NLLB](model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. +1. **[Nyströmformer](model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. +1. **[OPT](master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. +1. 
**[OWL-ViT](model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. +1. **[Pegasus](model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. +1. **[Perceiver IO](model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. +1. **[PhoBERT](model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen. +1. **[PLBart](model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang. +1. **[PoolFormer](model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng. +1. **[ProphetNet](model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. +1. **[QDQBert](model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. +1. **[RAG](model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela. +1. **[REALM](model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. +1. **[Reformer](model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. +1. **[RegNet](model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. +1. 
**[RemBERT](model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. +1. **[ResNet](model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. +1. **[RoBERTa](model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. +1. **[RoFormer](model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. +1. **[SegFormer](model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. +1. **[SEW](model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. +1. **[SEW-D](model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. +1. **[SpeechToTextTransformer](model_doc/speech_to_text)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. +1. **[SpeechToTextTransformer2](model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau. +1. **[Splinter](model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. +1. **[SqueezeBERT](model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer. +1. **[Swin Transformer](model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo. +1. 
**[Swin Transformer V2](model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. +1. **[T5](model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. +1. **[T5v1.1](model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. +1. **[TAPAS](model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. +1. **[TAPEX](model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. +1. **[Trajectory Transformer](model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine +1. **[Transformer-XL](model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. +1. **[TrOCR](model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. +1. **[UL2](model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler +1. **[UniSpeech](model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. +1. **[UniSpeechSat](model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu. +1. 
**[VAN](model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu. +1. **[VideoMAE](model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang. +1. **[ViLT](model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim. +1. **[Vision Transformer (ViT)](model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. +1. **[VisualBERT](model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. +1. **[ViTMAE](model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick. +1. **[Wav2Vec2](model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. +1. **[Wav2Vec2-Conformer](model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino. +1. **[Wav2Vec2Phoneme](model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli. +1. **[WavLM](model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei. +1. **[XGLM](model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. +1. 
**[XLM](model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau. +1. **[XLM-ProphetNet](model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. +1. **[XLM-RoBERTa](model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. +1. **[XLM-RoBERTa-XL](model_doc/xlm-roberta-xl)** (from Facebook AI), released together with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau. +1. **[XLNet](model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. +1. **[XLS-R](model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli. +1. **[XLSR-Wav2Vec2](model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. +1. **[YOLOS](model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu. +1. **[YOSO](model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh. + + +### Unterstützte Frameworks + +Die Tabelle unten zeigt die derzeitige Unterstützung in der Bibliothek für jedes dieser Modelle: ob sie einen Python-Tokenizer +haben (als "langsam" bezeichnet), einen "schnellen" Tokenizer, der von der 🤗 Tokenizers-Bibliothek unterstützt wird, und ob sie Unterstützung in Jax (via +Flax), PyTorch und/oder TensorFlow haben.
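Der Unterschied zwischen einem "langsamen" und einem "schnellen" Tokenizer lässt sich in wenigen Zeilen zeigen (eine minimale Skizze; der Modellname dient nur als Beispiel):

```python
from transformers import AutoTokenizer

# Standardmäßig wird, wo verfügbar, der "schnelle" (Rust-basierte) Tokenizer geladen;
# use_fast=False erzwingt die reine Python-Implementierung.
fast_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
slow_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)
print(fast_tokenizer.is_fast, slow_tokenizer.is_fast)  # True False
```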
+
+
+
+| Model | Tokenizer slow | Tokenizer fast | PyTorch support | TensorFlow support | Flax support |
+|:---------------------------:|:--------------:|:--------------:|:---------------:|:------------------:|:------------:|
+| ALBERT | ✅ | ✅ | ✅ | ✅ | ✅ |
+| BART | ✅ | ✅ | ✅ | ✅ | ✅ |
+| BEiT | ❌ | ❌ | ✅ | ❌ | ✅ |
+| BERT | ✅ | ✅ | ✅ | ✅ | ✅ |
+| Bert Generation | ✅ | ❌ | ✅ | ❌ | ❌ |
+| BigBird | ✅ | ✅ | ✅ | ❌ | ✅ |
+| BigBird-Pegasus | ❌ | ❌ | ✅ | ❌ | ❌ |
+| Blenderbot | ✅ | ✅ | ✅ | ✅ | ✅ |
+| BlenderbotSmall | ✅ | ✅ | ✅ | ✅ | ✅ |
+| BLOOM | ❌ | ✅ | ✅ | ❌ | ❌ |
+| CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
+| CANINE | ✅ | ❌ | ✅ | ❌ | ❌ |
+| CLIP | ✅ | ✅ | ✅ | ✅ | ✅ |
+| CodeGen | ✅ | ✅ | ✅ | ❌ | ❌ |
+| ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
+| ConvNeXT | ❌ | ❌ | ✅ | ✅ | ❌ |
+| CTRL | ✅ | ❌ | ✅ | ✅ | ❌ |
+| CvT | ❌ | ❌ | ✅ | ❌ | ❌ |
+| Data2VecAudio | ❌ | ❌ | ✅ | ❌ | ❌ |
+| Data2VecText | ❌ | ❌ | ✅ | ❌ | ❌ |
+| Data2VecVision | ❌ | ❌ | ✅ | ✅ | ❌ |
+| DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ |
+| DeBERTa-v2 | ✅ | ✅ | ✅ | ✅ | ❌ |
+| Decision Transformer | ❌ | ❌ | ✅ | ❌ | ❌ |
+| DeiT | ❌ | ❌ | ✅ | ✅ | ❌ |
+| DETR | ❌ | ❌ | ✅ | ❌ | ❌ |
+| DistilBERT | ✅ | ✅ | ✅ | ✅ | ✅ |
+| DPR | ✅ | ✅ | ✅ | ✅ | ❌ |
+| DPT | ❌ | ❌ | ✅ | ❌ | ❌ |
+| ELECTRA | ✅ | ✅ | ✅ | ✅ | ✅ |
+| Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ |
+| FairSeq Machine-Translation | ✅ | ❌ | ✅ | ❌ | ❌ |
+| FlauBERT | ✅ | ❌ | ✅ | ✅ | ❌ |
+| FLAVA | ❌ | ❌ | ✅ | ❌ | ❌ |
+| FNet | ✅ | ✅ | ✅ | ❌ | ❌ |
+| Funnel Transformer | ✅ | ✅ | ✅ | ✅ | ❌ |
+| GLPN | ❌ | ❌ | ✅ | ❌ | ❌ |
+| GPT Neo | ❌ | ❌ | ✅ | ❌ | ✅ |
+| GPT NeoX | ❌ | ✅ | ✅ | ❌ | ❌ |
+| GPT-J | ❌ | ❌ | ✅ | ✅ | ✅ |
+| GroupViT | ❌ | ❌ | ✅ | ❌ | ❌ |
+| Hubert | ❌ | ❌ | ✅ | ✅ | ❌ |
+| I-BERT | ❌ | ❌ | ✅ | ❌ | ❌ |
+| ImageGPT | ❌ | ❌ | ✅ | ❌ | ❌ |
+| LayoutLM | ✅ | ✅ | ✅ | ✅ | ❌ |
+| LayoutLMv2 | ✅ | ✅ | ✅ | ❌ | ❌ |
+| LayoutLMv3 | ✅ | ✅ | ✅ | ❌ | ❌ |
+| LED | ✅ | ✅ | ✅ | ✅ | ❌ |
+| LeViT | ❌ | ❌ | ✅ | ❌ | ❌ |
+| Longformer | ✅ | ✅ | ✅ | ✅ | ❌ |
+| LongT5 | ❌ | ❌ | ✅ | ❌ | ✅ |
+| LUKE | ✅ | ❌ | ✅ | ❌ | ❌ |
+| LXMERT | ✅ | ✅ | ✅ | ✅ | ❌ |
+| M-CTC-T | ❌ | ❌ | ✅ | ❌ | ❌ |
+| M2M100 | ✅ | ❌ | ✅ | ❌ | ❌ |
+| Marian | ✅ | ❌ | ✅ | ✅ | ✅ |
+| MaskFormer | ❌ | ❌ | ✅ | ❌ | ❌ |
+| mBART | ✅ | ✅ | ✅ | ✅ | ✅ |
+| Megatron-BERT | ❌ | ❌ | ✅ | ❌ | ❌ |
+| MobileBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
+| MobileViT | ❌ | ❌ | ✅ | ❌ | ❌ |
+| MPNet | ✅ | ✅ | ✅ | ✅ | ❌ |
+| MT5 | ✅ | ✅ | ✅ | ✅ | ✅ |
+| MVP | ✅ | ✅ | ✅ | ❌ | ❌ |
+| Nezha | ❌ | ❌ | ✅ | ❌ | ❌ |
+| Nyströmformer | ❌ | ❌ | ✅ | ❌ | ❌ |
+| OpenAI GPT | ✅ | ✅ | ✅ | ✅ | ❌ |
+| OpenAI GPT-2 | ✅ | ✅ | ✅ | ✅ | ✅ |
+| OPT | ❌ | ❌ | ✅ | ✅ | ✅ |
+| OWL-ViT | ❌ | ❌ | ✅ | ❌ | ❌ |
+| Pegasus | ✅ | ✅ | ✅ | ✅ | ✅ |
+| Perceiver | ✅ | ❌ | ✅ | ❌ | ❌ |
+| PLBart | ✅ | ❌ | ✅ | ❌ | ❌ |
+| PoolFormer | ❌ | ❌ | ✅ | ❌ | ❌ |
+| ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ |
+| QDQBert | ❌ | ❌ | ✅ | ❌ | ❌ |
+| RAG | ✅ | ❌ | ✅ | ✅ | ❌ |
+| REALM | ✅ | ✅ | ✅ | ❌ | ❌ |
+| Reformer | ✅ | ✅ | ✅ | ❌ | ❌ |
+| RegNet | ❌ | ❌ | ✅ | ✅ | ❌ |
+| RemBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
+| ResNet | ❌ | ❌ | ✅ | ✅ | ❌ |
+| RetriBERT | ✅ | ✅ | ✅ | ❌ | ❌ |
+| RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ |
+| RoFormer | ✅ | ✅ | ✅ | ✅ | ✅ |
+| SegFormer | ❌ | ❌ | ✅ | ✅ | ❌ |
+| SEW | ❌ | ❌ | ✅ | ❌ | ❌ |
+| SEW-D | ❌ | ❌ | ✅ | ❌ | ❌ |
+| Speech Encoder decoder | ❌ | ❌ | ✅ | ❌ | ✅ |
+| Speech2Text | ✅ | ❌ | ✅ | ✅ | ❌ |
+| Speech2Text2 | ✅ | ❌ | ❌ | ❌ | ❌ |
+| Splinter | ✅ | ✅ | ✅ | ❌ | ❌ |
+| SqueezeBERT | ✅ | ✅ | ✅ | ❌ | ❌ |
+| Swin Transformer | ❌ | ❌ | ✅ | ✅ | ❌ |
+| Swin Transformer V2 | ❌ | ❌ | ✅ | ❌ | ❌ |
+| T5 | ✅ | ✅ | ✅ | ✅ | ✅ |
+| TAPAS | ✅ | ❌ | ✅ | ✅ | ❌ |
+| Trajectory Transformer | ❌ | ❌ | ✅ | ❌ | ❌ |
+| Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ |
+| TrOCR | ❌ | ❌ | ✅ | ❌ | ❌ |
+| UniSpeech | ❌ | ❌ | ✅ | ❌ | ❌ |
+| UniSpeechSat | ❌ | ❌ | ✅ | ❌ | ❌ |
+| VAN | ❌ | ❌ | ✅ | ❌ | ❌ |
+| VideoMAE | ❌ | ❌ | ✅ | ❌ | ❌ |
+| ViLT | ❌ | ❌ | ✅ | ❌ | ❌ |
+| Vision Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ |
+| VisionTextDualEncoder | ❌ | ❌ | ✅ | ❌ | ✅ |
+| VisualBERT | ❌ | ❌ | ✅ | ❌ | ❌ |
+| ViT | ❌ | ❌ | ✅ | ✅ | ✅ |
+| ViTMAE | ❌ | ❌ | ✅ | ✅ | ❌ |
+| Wav2Vec2 | ✅ | ❌ | ✅ | ✅ | ✅ |
+| Wav2Vec2-Conformer | ❌ | ❌ | ✅ | ❌ | ❌ |
+| WavLM | ❌ | ❌ | ✅ | ❌ | ❌ |
+| XGLM | ✅ | ✅ | ✅ | ❌ | ✅ |
+| XLM | ✅ | ❌ | ✅ | ✅ | ❌ |
+| XLM-ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ |
+| XLM-RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ |
+| XLM-RoBERTa-XL | ❌ | ❌ | ✅ | ❌ | ❌ |
+| XLNet | ✅ | ✅ | ✅ | ✅ | ❌ |
+| YOLOS | ❌ | ❌ | ✅ | ❌ | ❌ |
+| YOSO | ❌ | ❌ | ✅ | ❌ | ❌ |
+
+
diff --git a/docs/source/de/installation.mdx b/docs/source/de/installation.mdx
new file mode 100644
index 00000000000000..3103830ee7fd8a
--- /dev/null
+++ b/docs/source/de/installation.mdx
@@ -0,0 +1,246 @@
+
+
+# Installation
+
+Installieren Sie 🤗 Transformers für die Deep-Learning-Bibliothek, mit der Sie arbeiten, richten Sie Ihren Cache ein und konfigurieren Sie 🤗 Transformers optional für den Offline-Betrieb.
+
+🤗 Transformers wurde unter Python 3.6+, PyTorch 1.1.0+, TensorFlow 2.0+ und Flax getestet. Folgen Sie den Installationsanweisungen unten für die von Ihnen verwendete Deep-Learning-Bibliothek:
+
+* [PyTorch](https://pytorch.org/get-started/locally/) Installationsanweisungen.
+* [TensorFlow 2.0](https://www.tensorflow.org/install/pip) Installationsanweisungen.
+* [Flax](https://flax.readthedocs.io/en/latest/) Installationsanweisungen.
+
+## Installation mit pip
+
+Sie sollten 🤗 Transformers in einer [virtuellen Umgebung](https://docs.python.org/3/library/venv.html) installieren. Wenn Sie mit virtuellen Python-Umgebungen nicht vertraut sind, werfen Sie einen Blick auf diese [Anleitung](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). Eine virtuelle Umgebung macht es einfacher, verschiedene Projekte zu verwalten und Kompatibilitätsprobleme zwischen Abhängigkeiten zu vermeiden.
+
+Beginnen wir mit der Erstellung einer virtuellen Umgebung in Ihrem Projektverzeichnis:
+
+
+```bash
+python -m venv .env
+```
+
+Aktivieren wir die virtuelle Umgebung. Unter Linux und macOS:
+
+```bash
+source .env/bin/activate
+```
+
+Aktivieren wir die virtuelle Umgebung unter Windows:
+
+```bash
+.env/Scripts/activate
+```
+
+Jetzt können wir 🤗 Transformers mit dem folgenden Befehl installieren:
+
+```bash
+pip install transformers
+```
+
+Bei reiner CPU-Unterstützung können wir 🤗 Transformers und eine Deep-Learning-Bibliothek bequem in einer Zeile installieren. Installieren wir zum Beispiel 🤗 Transformers und PyTorch mit:
+
+```bash
+pip install transformers[torch]
+```
+
+🤗 Transformers und TensorFlow 2.0:
+
+```bash
+pip install transformers[tf-cpu]
+```
+
+🤗 Transformers und Flax:
+
+```bash
+pip install transformers[flax]
+```
+
+Überprüfen wir abschließend, ob 🤗 Transformers ordnungsgemäß installiert wurde, indem wir den folgenden Befehl ausführen. Es wird ein vortrainiertes Modell heruntergeladen:
+
+```bash
+python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))"
+```
+
+Anschließend werden das Label und die Wahrscheinlichkeit ausgegeben:
+
+```bash
+[{'label': 'POSITIVE', 'score': 0.9998704791069031}]
+```
+
+## Installation aus dem Quellcode
+
+Installieren wir 🤗 Transformers aus dem Quellcode mit dem folgenden Befehl:
+
+```bash
+pip install git+https://github.com/huggingface/transformers
+```
+
+Dieser Befehl installiert die aktuelle `main`-Version und nicht die neueste `stable`-Version. Die `main`-Version ist nützlich, um mit den neuesten Entwicklungen Schritt zu halten, zum Beispiel wenn ein Fehler seit der letzten offiziellen Version behoben, aber noch keine neue Version veröffentlicht wurde. Das bedeutet jedoch, dass die `main`-Version nicht immer stabil ist. Wir bemühen uns, die `main`-Version einsatzbereit zu halten, und die meisten Probleme werden normalerweise innerhalb weniger Stunden oder eines Tages behoben. Wenn Sie auf ein Problem stoßen, öffnen Sie bitte ein [Issue](https://github.com/huggingface/transformers/issues), damit wir es noch schneller beheben können!
+
+Überprüfen wir, ob 🤗 Transformers richtig installiert wurde, indem wir den folgenden Befehl ausführen:
+
+
+```bash
+python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('I love you'))"
+```
+
+## Editierbare Installation
+
+Sie benötigen eine editierbare Installation, wenn Sie:
+
+* die `main`-Version des Quellcodes verwenden möchten.
+* zu 🤗 Transformers beitragen und Änderungen am Code testen möchten.
+
+Klonen Sie das Repository und installieren Sie 🤗 Transformers mit den folgenden Befehlen:
+
+```bash
+git clone https://github.com/huggingface/transformers.git
+cd transformers
+pip install -e .
+```
+
+Diese Befehle verknüpfen den Ordner, in den Sie das Repository geklont haben, mit den Pfaden Ihrer Python-Bibliotheken. Python sucht nun zusätzlich zu den normalen Bibliothekspfaden auch in dem Ordner, in den Sie geklont haben. Wenn Ihre Python-Pakete zum Beispiel normalerweise in `~/anaconda3/envs/main/lib/python3.7/site-packages/` installiert sind, durchsucht Python auch den geklonten Ordner: `~/transformers/`.
+
+
+
+
+Sie müssen den Ordner `transformers` behalten, wenn Sie die Bibliothek weiter verwenden wollen.
+
+
+
+Jetzt können Sie Ihren Klon mit dem folgenden Befehl ganz einfach auf die neueste Version von 🤗 Transformers aktualisieren:
+
+
+```bash
+cd ~/transformers/
+git pull
+```
+
+Ihre Python-Umgebung wird beim nächsten Ausführen die `main`-Version von 🤗 Transformers finden.
+
+## Installation mit conda
+
+Installation über den conda-Kanal `huggingface`:
+
+```bash
+conda install -c huggingface transformers
+```
+
+## Cache einrichten
+
+Vortrainierte Modelle werden heruntergeladen und lokal unter `~/.cache/huggingface/hub` zwischengespeichert. Dies ist das Standardverzeichnis, das durch die Shell-Umgebungsvariable `TRANSFORMERS_CACHE` vorgegeben ist. Unter Windows lautet das Standardverzeichnis `C:\Users\Benutzername\.cache\huggingface\hub`. Sie können die unten aufgeführten Shell-Umgebungsvariablen (in absteigender Priorität) ändern, um ein anderes Cache-Verzeichnis anzugeben:
+
+1. Shell-Umgebungsvariable (Standard): `HUGGINGFACE_HUB_CACHE` oder `TRANSFORMERS_CACHE` (siehe Beispiel unten).
+2. Shell-Umgebungsvariable: `HF_HOME`.
+3. Shell-Umgebungsvariable: `XDG_CACHE_HOME` + `/huggingface`.
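+
+Ein kurzes Beispiel für die erste Variante (der Pfad ist frei gewählt und dient nur der Veranschaulichung):
+
+```bash
+# Eigenes Cache-Verzeichnis setzen (Beispielpfad, bitte anpassen)
+export TRANSFORMERS_CACHE=/pfad/zu/meinem/cache
+```
+
+Damit die Einstellung dauerhaft gilt, kann die Zeile z. B. in die `~/.bashrc` eingetragen werden.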
+
+
+
+
+Transformers verwendet die Shell-Umgebungsvariablen `PYTORCH_TRANSFORMERS_CACHE` oder `PYTORCH_PRETRAINED_BERT_CACHE`, wenn Sie von einer früheren Iteration dieser Bibliothek kommen und diese Umgebungsvariablen gesetzt haben, sofern Sie nicht die Shell-Umgebungsvariable `TRANSFORMERS_CACHE` angeben.
+
+
+
+## Offline-Modus
+
+Transformers kann in einer Firewall- oder Offline-Umgebung laufen, indem es nur lokale Dateien verwendet. Setzen Sie die Umgebungsvariable `TRANSFORMERS_OFFLINE=1`, um dieses Verhalten zu aktivieren.
+
+
+
+Fügen Sie [🤗 Datasets](https://huggingface.co/docs/datasets/) zu Ihrem Offline-Trainingsworkflow hinzu, indem Sie die Umgebungsvariable `HF_DATASETS_OFFLINE=1` setzen.
+
+
+
+So würden Sie beispielsweise ein Programm in einem normalen Netzwerk, das externe Instanzen per Firewall blockiert, mit dem folgenden Befehl ausführen:
+
+```bash
+python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
+```
+
+Führen Sie dasselbe Programm in einer Offline-Instanz so aus:
+
+```bash
+HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \
+python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
+```
+
+Das Skript sollte nun laufen, ohne sich aufzuhängen oder auf eine Zeitüberschreitung zu warten, da es weiß, dass es nur nach lokalen Dateien suchen soll.
+
+
+### Abrufen von Modellen und Tokenizern zur Offline-Verwendung
+
+Eine andere Möglichkeit, 🤗 Transformers offline zu verwenden, besteht darin, die Dateien im Voraus herunterzuladen und dann auf ihren lokalen Pfad zu verweisen, wenn Sie sie offline verwenden müssen. Es gibt drei Möglichkeiten, dies zu tun:
+
+* Laden Sie eine Datei über die Benutzeroberfläche des [Model Hub](https://huggingface.co/models) herunter, indem Sie auf das ↓-Symbol klicken.
+
+    ![download-icon](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/download-icon.png)
+
+* Verwenden Sie den [`PreTrainedModel.from_pretrained`]- und [`PreTrainedModel.save_pretrained`]-Workflow:
+
+    1. Laden Sie Ihre Dateien im Voraus mit [`PreTrainedModel.from_pretrained`] herunter:
+
+    ```py
+    >>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("bigscience/T0_3B")
+    >>> model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0_3B")
+    ```
+
+    2. Speichern Sie Ihre Dateien in einem bestimmten Verzeichnis mit [`PreTrainedModel.save_pretrained`]:
+
+    ```py
+    >>> tokenizer.save_pretrained("./your/path/bigscience_t0")
+    >>> model.save_pretrained("./your/path/bigscience_t0")
+    ```
+
+    3. Wenn Sie nun offline sind, laden Sie Ihre Dateien mit [`PreTrainedModel.from_pretrained`] aus dem angegebenen Verzeichnis:
+
+    ```py
+    >>> tokenizer = AutoTokenizer.from_pretrained("./your/path/bigscience_t0")
+    >>> model = AutoModelForSeq2SeqLM.from_pretrained("./your/path/bigscience_t0")
+    ```
+
+* Laden Sie Dateien programmatisch mit der [huggingface_hub](https://github.com/huggingface/huggingface_hub/tree/main/src/huggingface_hub)-Bibliothek herunter:
+
+    1. Installieren Sie die `huggingface_hub`-Bibliothek in Ihrer virtuellen Umgebung:
+
+    ```bash
+    python -m pip install huggingface_hub
+    ```
+
+    2. Verwenden Sie die Funktion [`hf_hub_download`](https://huggingface.co/docs/hub/adding-a-library#download-files-from-the-hub), um eine Datei in einen bestimmten Pfad herunterzuladen. Der folgende Befehl lädt zum Beispiel die Datei `config.json` aus dem Modell [T0](https://huggingface.co/bigscience/T0_3B) in den gewünschten Pfad herunter:
+
+    ```py
+    >>> from huggingface_hub import hf_hub_download
+
+    >>> hf_hub_download(repo_id="bigscience/T0_3B", filename="config.json", cache_dir="./your/path/bigscience_t0")
+    ```
+
+Sobald Ihre Datei heruntergeladen und lokal zwischengespeichert ist, geben Sie ihren lokalen Pfad an, um sie zu laden und zu verwenden:
+
+```py
+>>> from transformers import AutoConfig
+
+>>> config = AutoConfig.from_pretrained("./your/path/bigscience_t0/config.json")
+```
+
+
+
+Weitere Informationen zum Herunterladen von Dateien, die auf dem Hub gespeichert sind, finden Sie im Abschnitt [Wie man Dateien vom Hub herunterlädt](https://huggingface.co/docs/hub/how-to-downstream).
+
+
diff --git a/docs/source/de/quicktour.mdx b/docs/source/de/quicktour.mdx
new file mode 100644
index 00000000000000..4c668bf419b134
--- /dev/null
+++ b/docs/source/de/quicktour.mdx
@@ -0,0 +1,428 @@
+
+
+# Schnellstart
+
+[[open-in-colab]]
+
+Mit 🤗 Transformers können Sie sofort loslegen! Verwenden Sie die [`pipeline`] für schnelle Inferenz und laden Sie ein vortrainiertes Modell und einen Tokenizer mit einer [AutoClass](./model_doc/auto), um Ihre Text-, Bild- oder Audioaufgabe zu lösen.
+
+
+
+Alle in der Dokumentation vorgestellten Codebeispiele haben oben links einen Umschalter für PyTorch und TensorFlow. Wenn nicht, wird erwartet, dass der Code für beide Backends ohne Änderungen funktioniert.
+
+
+
+## Pipeline
+
+[`pipeline`] ist der einfachste Weg, ein vortrainiertes Modell für eine bestimmte Aufgabe zu verwenden.
+
+
+
+Die [`pipeline`] unterstützt viele gängige Aufgaben:
+
+**Text**:
+* Stimmungsanalyse: Klassifizierung der Polarität eines gegebenen Textes.
+* Textgenerierung (auf Englisch): Generierung von Text aus einer gegebenen Eingabe.
+* Named-Entity-Recognition (NER): Kennzeichnung jedes Worts mit der Entität, die es repräsentiert (Person, Datum, Ort usw.).
+* Beantwortung von Fragen: Extrahieren der Antwort aus dem Kontext, wenn ein gewisser Kontext und eine Frage gegeben sind.
+* Fill-mask: Ausfüllen von Lücken in einem Text mit maskierten Wörtern.
+* Zusammenfassung: Erstellung einer Zusammenfassung einer langen Text- oder Dokumentensequenz.
+* Übersetzung: Übersetzen eines Textes in eine andere Sprache.
+* Merkmalsextraktion: Erstellen einer Tensordarstellung des Textes.
+
+**Bild**:
+* Bildklassifizierung: Klassifizierung eines Bildes.
+* Bildsegmentierung: Klassifizierung jedes Pixels in einem Bild.
+* Objekterkennung: Erkennen von Objekten innerhalb eines Bildes.
+
+**Audio**:
+* Audioklassifizierung: Zuweisung eines Labels zu einem bestimmten Audiosegment.
+* Automatische Spracherkennung (ASR): Transkription von Audiodaten in Text.
+
+
+
+Mehr Details zur [`pipeline`] und den zugehörigen Aufgaben finden Sie in der Dokumentation [hier](./main_classes/pipelines).
+
+
+
+### Verwendung der Pipeline
+
+Im folgenden Beispiel werden Sie die [`pipeline`] für die Stimmungsanalyse verwenden.
+
+Installieren Sie die folgenden Abhängigkeiten, falls Sie dies nicht bereits getan haben:
+
+
+
+```bash
+pip install torch
+```
+
+
+```bash
+pip install tensorflow
+```
+
+
+
+Importieren Sie die [`pipeline`] und geben Sie die Aufgabe an, die Sie lösen möchten:
+
+```py
+>>> from transformers import pipeline
+
+>>> classifier = pipeline("sentiment-analysis")
+```
+
+Die Pipeline lädt ein standardmäßiges [vortrainiertes Modell](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english) und einen Tokenizer für die Stimmungsanalyse herunter und speichert beide im Cache. Jetzt können Sie den `classifier` auf Ihren Zieltext anwenden:
+
+```py
+>>> classifier("We are very happy to show you the 🤗 Transformers library.")
+[{'label': 'POSITIVE', 'score': 0.9998}]
+```
+
+Bei mehr als einem Satz übergeben Sie der [`pipeline`] eine Liste von Sätzen; sie gibt dann eine Liste von Wörterbüchern zurück:
+
+```py
+>>> results = classifier(["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."])
+>>> for result in results:
+...     print(f"label: {result['label']}, with score: {round(result['score'], 4)}")
+label: POSITIVE, with score: 0.9998
+label: NEGATIVE, with score: 0.5309
+```
+
+Die [`pipeline`] kann auch über einen ganzen Datensatz iterieren. Starten wir mit der Installation der [🤗 Datasets](https://huggingface.co/docs/datasets/)-Bibliothek:
+
+```bash
+pip install datasets
+```
+
+Erstellen wir eine [`pipeline`] mit der Aufgabe, die wir lösen möchten, und dem Modell, das wir verwenden möchten.
+
+```py
+>>> import torch
+>>> from transformers import pipeline
+
+>>> speech_recognizer = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")
+```
+
+Als Nächstes laden wir den Datensatz, den wir verwenden möchten (siehe den 🤗 Datasets [Quick Start](https://huggingface.co/docs/datasets/quickstart.html) für mehr Details). Laden wir zum Beispiel den [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14)-Datensatz:
+
+```py
+>>> from datasets import load_dataset, Audio
+
+>>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train")  # doctest: +IGNORE_RESULT
+```
+
+Wir müssen sicherstellen, dass die Abtastrate des Datensatzes der Abtastrate entspricht, mit der `facebook/wav2vec2-base-960h` trainiert wurde.
+
+```py
+>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=speech_recognizer.feature_extractor.sampling_rate))
+```
+
+Audiodateien werden automatisch geladen und neu abgetastet, wenn die Spalte `audio` aufgerufen wird.
+Extrahieren wir die rohen Wellenform-Arrays der ersten vier Beispiele und übergeben sie als Liste an die Pipeline:
+
+```py
+>>> result = speech_recognizer(dataset[:4]["audio"])
+>>> print([d["text"] for d in result])
+['I WOULD LIKE TO SET UP A JOINT ACCOUNT WITH MY PARTNER HOW DO I PROCEED WITH DOING THAT', "FODING HOW I'D SET UP A JOIN TO HET WITH MY WIFE AND WHERE THE AP MIGHT BE", "I I'D LIKE TOY SET UP A JOINT ACCOUNT WITH MY PARTNER I'M NOT SEEING THE OPTION TO DO IT ON THE AP SO I CALLED IN TO GET SOME HELP CAN I JUST DO IT OVER THE PHONE WITH YOU AND GIVE YOU THE INFORMATION OR SHOULD I DO IT IN THE AP AND I'M MISSING SOMETHING UQUETTE HAD PREFERRED TO JUST DO IT OVER THE PHONE OF POSSIBLE THINGS", 'HOW DO I THURN A JOIN A COUNT']
+```
+
+Bei einem größeren Datensatz mit vielen Eingaben (wie bei Sprache oder Bildverarbeitung) sollten Sie statt einer Liste, die alle Eingaben in den Speicher lädt, einen Generator übergeben; eine minimale Skizze folgt unten. Weitere Informationen finden Sie in der [Pipeline-Dokumentation](./main_classes/pipelines).
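+
+Die Skizze ist nur zur Veranschaulichung gedacht (die Funktion `audio_iterator` ist frei benannt; `dataset` und `speech_recognizer` stammen aus den Beispielen oben):
+
+```py
+>>> def audio_iterator():  # Beispiel-Generator: hält jeweils nur ein Element im Speicher
+...     for beispiel in dataset:
+...         yield beispiel["audio"]
+
+>>> for ausgabe in speech_recognizer(audio_iterator()):  # doctest: +SKIP
+...     print(ausgabe["text"])
+```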
+
+### Ein anderes Modell und einen anderen Tokenizer in der Pipeline verwenden
+
+Die [`pipeline`] kann jedes Modell aus dem [Model Hub](https://huggingface.co/models) verwenden, wodurch es einfach ist, die [`pipeline`] für andere Anwendungsfälle anzupassen. Wenn Sie beispielsweise ein Modell wünschen, das französischen Text verarbeiten kann, verwenden Sie die Tags im Model Hub, um nach einem geeigneten Modell zu filtern. Das oberste gefilterte Ergebnis liefert ein mehrsprachiges [BERT-Modell](https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment), das auf die Stimmungsanalyse abgestimmt ist. Großartig, verwenden wir dieses Modell!
+
+```py
+>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
+```
+
+
+
+Verwenden Sie [`AutoModelForSequenceClassification`] und [`AutoTokenizer`], um das vortrainierte Modell und den zugehörigen Tokenizer zu laden (mehr zu einer `AutoClass` weiter unten):
+
+```py
+>>> from transformers import AutoTokenizer, AutoModelForSequenceClassification

+>>> model = AutoModelForSequenceClassification.from_pretrained(model_name)
+>>> tokenizer = AutoTokenizer.from_pretrained(model_name)
+```
+
+
+Verwenden Sie [`TFAutoModelForSequenceClassification`] und [`AutoTokenizer`], um das vortrainierte Modell und den zugehörigen Tokenizer zu laden (mehr zu einer `TFAutoClass` weiter unten):
+
+```py
+>>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
+
+>>> model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
+>>> tokenizer = AutoTokenizer.from_pretrained(model_name)
+```
+
+
+
+Dann können Sie das Modell und den Tokenizer in der [`pipeline`] angeben und den `classifier` auf Ihren Zieltext anwenden:
+
+```py
+>>> classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
+>>> classifier("Nous sommes très heureux de vous présenter la bibliothèque 🤗 Transformers.")
+[{'label': '5 stars', 'score': 0.7273}]
+```
+
+Wenn Sie kein Modell für Ihren Anwendungsfall finden können, müssen Sie ein vortrainiertes Modell auf Ihren Daten feinabstimmen. Schauen Sie sich unser [Feinabstimmungs-Tutorial](./training) an, um zu erfahren, wie das geht. Und nachdem Sie Ihr Modell feinabgestimmt haben, sollten Sie es mit der Community im Model Hub teilen (siehe Tutorial [hier](./model_sharing)), um NLP für alle zu demokratisieren! 🤗
+
+## AutoClass
+
+
+
+Unter der Haube arbeiten die Klassen [`AutoModelForSequenceClassification`] und [`AutoTokenizer`] zusammen, um die [`pipeline`] zu betreiben. Eine [`AutoClass`](./model_doc/auto) ist eine Abkürzung, die automatisch die Architektur eines vortrainierten Modells aus dessen Namen oder Pfad abruft. Sie müssen nur die passende `AutoClass` für Ihre Aufgabe und den zugehörigen Tokenizer mit [`AutoTokenizer`] auswählen.
+
+Kehren wir zu unserem Beispiel zurück und sehen wir uns an, wie Sie die `AutoClass` verwenden können, um die Ergebnisse der [`pipeline`] zu replizieren.
+
+### AutoTokenizer
+
+Ein Tokenizer ist für die Vorverarbeitung von Text in ein für das Modell verständliches Format zuständig. Zunächst zerlegt der Tokenizer den Text in Wörter, die *Token* genannt werden. Es gibt mehrere Regeln für den Tokenisierungsprozess, z. B. wie und auf welcher Ebene ein Wort aufgespalten wird (weitere Informationen über Tokenisierung [hier](./tokenizer_summary)). Das Wichtigste ist jedoch, dass Sie den Tokenizer mit demselben Modellnamen instanziieren müssen, um sicherzustellen, dass Sie dieselben Tokenisierungsregeln verwenden, mit denen das Modell vortrainiert wurde.
+Laden Sie einen Tokenizer mit [`AutoTokenizer`]:
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
+>>> tokenizer = AutoTokenizer.from_pretrained(model_name)
+```
+
+Anschließend wandelt der Tokenizer die Token in Zahlen um, um einen Tensor als Eingabe für das Modell zu konstruieren. Diese Zuordnung von Token zu Zahlen wird als *Vokabular* des Modells bezeichnet.
+
+Übergeben Sie Ihren Text an den Tokenizer:
+
+```py
+>>> encoding = tokenizer("We are very happy to show you the 🤗 Transformers library.")
+>>> print(encoding)
+{'input_ids': [101, 11312, 10320, 12495, 19308, 10114, 11391, 10855, 10103, 100, 58263, 13299, 119, 102],
+ 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+ 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
+```
+
+Der Tokenizer gibt ein Wörterbuch zurück, das Folgendes enthält:
+
+* [input_ids](./glossary#input-ids): numerische Repräsentationen Ihrer Token.
+* [attention_mask](./glossary#attention-mask): gibt an, welche Token beachtet werden sollen.
+
+Genau wie die [`pipeline`] akzeptiert der Tokenizer eine Liste von Eingaben. Darüber hinaus kann der Tokenizer den Text auch auffüllen und kürzen, um einen Stapel mit einheitlicher Länge zurückzugeben:
+
+
+
+```py
+>>> pt_batch = tokenizer(
+...     ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."],
+...     padding=True,
+...     truncation=True,
+...     max_length=512,
+...     return_tensors="pt",
+... )
+```
+
+
+```py
+>>> tf_batch = tokenizer(
+...     ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."],
+...     padding=True,
+...     truncation=True,
+...     max_length=512,
+...     return_tensors="tf",
+... )
+```
+
+
+
+Lesen Sie das Tutorial [preprocessing](./preprocessing) für weitere Details zur Tokenisierung.
+
+### AutoModel
+
+
+
+🤗 Transformers bietet eine einfache und einheitliche Möglichkeit, vortrainierte Instanzen zu laden. Das bedeutet, dass Sie ein [`AutoModel`] laden können, wie Sie einen [`AutoTokenizer`] laden würden. Der einzige Unterschied ist die Auswahl des richtigen [`AutoModel`] für die Aufgabe. Da Sie eine Text- oder Sequenzklassifizierung vornehmen, laden Sie [`AutoModelForSequenceClassification`]:
+
+```py
+>>> from transformers import AutoModelForSequenceClassification
+
+>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
+>>> pt_model = AutoModelForSequenceClassification.from_pretrained(model_name)
+```
+
+
+
+In der [Aufgabenzusammenfassung](./task_summary) steht, welche [`AutoModel`]-Klasse für welche Aufgabe zu verwenden ist.
+
+
+
+Jetzt können Sie Ihren vorverarbeiteten Stapel von Eingaben direkt an das Modell übergeben. Sie müssen nur das Wörterbuch entpacken, indem Sie `**` hinzufügen:
+
+```py
+>>> pt_outputs = pt_model(**pt_batch)
+```
+
+Das Modell gibt die endgültigen Aktivierungen im Attribut `logits` aus. Wenden Sie die Softmax-Funktion auf die `logits` an, um die Wahrscheinlichkeiten zu erhalten:
+
+```py
+>>> from torch import nn
+
+>>> pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1)
+>>> print(pt_predictions)
+tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
+        [0.2084, 0.1826, 0.1969, 0.1755, 0.2365]], grad_fn=<SoftmaxBackward0>)
+```
+
+
+🤗 Transformers bietet eine einfache und einheitliche Methode zum Laden von vortrainierten Instanzen. Das bedeutet, dass Sie ein [`TFAutoModel`] genauso laden können, wie Sie einen [`AutoTokenizer`] laden würden. Der einzige Unterschied ist die Auswahl des richtigen [`TFAutoModel`] für die Aufgabe. Da Sie eine Text- oder Sequenzklassifizierung vornehmen, laden Sie [`TFAutoModelForSequenceClassification`]:
+
+```py
+>>> from transformers import TFAutoModelForSequenceClassification
+
+>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
+>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
+```
+
+
+
+In der [Aufgabenzusammenfassung](./task_summary) steht, welche [`AutoModel`]-Klasse für welche Aufgabe zu verwenden ist.
+
+
+
+Jetzt können Sie Ihren vorverarbeiteten Stapel von Eingaben direkt an das Modell übergeben, indem Sie die Wörterbuchschlüssel direkt an die Tensoren übergeben:
+
+```py
+>>> tf_outputs = tf_model(tf_batch)
+```
+
+Das Modell gibt die endgültigen Aktivierungen im Attribut `logits` aus. Wenden Sie die Softmax-Funktion auf die `logits` an, um die Wahrscheinlichkeiten zu erhalten:
+
+```py
+>>> import tensorflow as tf
+
+>>> tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1)
+>>> tf_predictions  # doctest: +IGNORE_RESULT
+```
+
+
+
+
+
+Alle 🤗 Transformers-Modelle (PyTorch oder TensorFlow) geben die Tensoren *vor* der endgültigen Aktivierungsfunktion (wie Softmax) aus, da die endgültige Aktivierungsfunktion oft mit der Verlustfunktion verschmolzen ist.
+
+
+
+Modelle sind ein standardmäßiges [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) oder ein [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model), sodass Sie sie in Ihrer üblichen Trainingsschleife verwenden können. Um die Dinge einfacher zu machen, bietet 🤗 Transformers jedoch eine [`Trainer`]-Klasse für PyTorch, die Funktionalität für verteiltes Training, gemischte Präzision und mehr bietet. Für TensorFlow können Sie die Methode `fit` aus [Keras](https://keras.io/) verwenden. Siehe das [training tutorial](./training) für weitere Details.
+
+
+
+Transformers-Modellausgaben sind spezielle Datenklassen, sodass ihre Attribute in einer IDE automatisch vervollständigt werden.
+Die Modellausgaben verhalten sich auch wie ein Tupel oder ein Wörterbuch (Sie können z. B. mit einem Integer, einem Slice oder einem String indexieren), wobei die Attribute, die `None` sind, ignoriert werden.
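+
+Ein kurzes Beispiel mit dem oben berechneten `pt_outputs` (die Form `[2, 5]` ergibt sich hier aus den zwei Beispielsätzen und den fünf Labels dieses Modells):
+
+```py
+>>> pt_outputs.logits is pt_outputs[0]  # Attribut- und Index-Zugriff liefern dasselbe Tensor-Objekt
+True
+>>> pt_outputs["logits"].shape  # Zugriff über den Schlüssel wie bei einem Wörterbuch
+torch.Size([2, 5])
+```
+
+Da hier keine Labels übergeben wurden, ist das Attribut `loss` gleich `None` und wird beim Indexieren übersprungen.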
+
+
+
+### Modell speichern
+
+
+
+Sobald Ihr Modell feinabgestimmt ist, können Sie es mit seinem Tokenizer speichern, indem Sie [`PreTrainedModel.save_pretrained`] verwenden:
+
+```py
+>>> pt_save_directory = "./pt_save_pretrained"
+>>> tokenizer.save_pretrained(pt_save_directory)  # doctest: +IGNORE_RESULT
+>>> pt_model.save_pretrained(pt_save_directory)
+```
+
+Wenn Sie bereit sind, das Modell erneut zu verwenden, laden Sie es mit [`PreTrainedModel.from_pretrained`]:
+
+```py
+>>> pt_model = AutoModelForSequenceClassification.from_pretrained("./pt_save_pretrained")
+```
+
+
+Sobald Ihr Modell feinabgestimmt ist, können Sie es mit seinem Tokenizer unter Verwendung von [`TFPreTrainedModel.save_pretrained`] speichern:
+
+```py
+>>> tf_save_directory = "./tf_save_pretrained"
+>>> tokenizer.save_pretrained(tf_save_directory)  # doctest: +IGNORE_RESULT
+>>> tf_model.save_pretrained(tf_save_directory)
+```
+
+Wenn Sie bereit sind, das Modell wieder zu verwenden, laden Sie es mit [`TFPreTrainedModel.from_pretrained`]:
+
+```py
+>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("./tf_save_pretrained")
+```
+
+
+
+Ein besonders cooles 🤗 Transformers-Feature ist die Möglichkeit, ein Modell zu speichern und es entweder als PyTorch- oder TensorFlow-Modell wieder zu laden. Der Parameter `from_pt` bzw. `from_tf` konvertiert das Modell von einem Framework in das andere:
+
+
+
+```py
+>>> from transformers import AutoModelForSequenceClassification
+
+>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory)
+>>> pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True)
+```
+
+
+```py
+>>> from transformers import TFAutoModelForSequenceClassification
+
+>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory)
+>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True)
+```
+
+
+
+## Benutzerdefinierte Modelle erstellen
+
+Sie können die Konfigurationsklasse des Modells ändern, um zu bestimmen, wie ein Modell aufgebaut ist. Die Konfiguration legt die Attribute eines Modells fest, z. B. die Anzahl der verborgenen Schichten oder der Aufmerksamkeitsköpfe. Wenn Sie ein Modell aus einer benutzerdefinierten Konfigurationsklasse initialisieren, beginnen Sie bei null: Die Modellattribute werden zufällig initialisiert, und Sie müssen das Modell erst trainieren, bevor es aussagekräftige Ergebnisse liefert.
+
+Beginnen Sie mit dem Import von [`AutoConfig`] und laden Sie dann das vortrainierte Modell, das Sie ändern möchten. Innerhalb von [`AutoConfig.from_pretrained`] können Sie das Attribut angeben, das Sie ändern möchten, z. B. die Anzahl der Aufmerksamkeitsköpfe:
+
+```py
+>>> from transformers import AutoConfig
+
+>>> my_config = AutoConfig.from_pretrained("distilbert-base-uncased", n_heads=12)
+```
+
+
+
+Erstellen Sie ein Modell aus Ihrer benutzerdefinierten Konfiguration mit [`AutoModel.from_config`]:
+
+```py
+>>> from transformers import AutoModel
+
+>>> my_model = AutoModel.from_config(my_config)
+```
+
+
+Erstellen Sie ein Modell aus Ihrer benutzerdefinierten Konfiguration mit [`TFAutoModel.from_config`]:
+
+```py
+>>> from transformers import TFAutoModel
+
+>>> my_model = TFAutoModel.from_config(my_config)
+```
+
+
+
+Weitere Informationen zur Erstellung von benutzerdefinierten Konfigurationen finden Sie in der Anleitung [Erstellen einer benutzerdefinierten Architektur](./create_a_model).
+
+## Wie geht es weiter?
+ +Nachdem Sie nun die 🤗 Transformers-Kurztour abgeschlossen haben, schauen Sie sich unsere Anleitungen an und erfahren Sie, wie Sie spezifischere Dinge tun können, wie das Schreiben eines benutzerdefinierten Modells, die Feinabstimmung eines Modells für eine Aufgabe und wie man ein Modell mit einem Skript trainiert. Wenn Sie mehr über die Kernkonzepte von 🤗 Transformers erfahren möchten, nehmen Sie sich eine Tasse Kaffee und werfen Sie einen Blick auf unsere konzeptionellen Leitfäden! From 9d87c2de4bfd710e21ee9cbd11c3633820471aaf Mon Sep 17 00:00:00 2001 From: iiLaurens Date: Thu, 11 Aug 2022 15:54:43 +0200 Subject: [PATCH 114/162] Deberta V2: Fix critical trace warnings to allow ONNX export (#18272) * Fix critical trace warnings to allow ONNX export * Force input to `sqrt` to be float type * Cleanup code * Remove unused import statement * Update model sew * Small refactor Co-authored-by: Michael Benayoun * Use broadcasting instead of repeat * Implement suggestion Co-authored-by: Michael Benayoun * Match deberta v2 changes in sew_d * Improve code quality * Update code quality * Consistency of small refactor * Match changes in sew_d Co-authored-by: Michael Benayoun --- .../models/deberta_v2/modeling_deberta_v2.py | 30 +++++++++++-------- .../models/sew_d/modeling_sew_d.py | 28 ++++++++++------- 2 files changed, 34 insertions(+), 24 deletions(-) diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py index a513a8280ed51d..3243ee108d488b 100644 --- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py @@ -14,11 +14,9 @@ # limitations under the License. """ PyTorch DeBERTa-v2 model.""" -import math from collections.abc import Sequence from typing import Optional, Tuple, Union -import numpy as np import torch import torch.utils.checkpoint from torch import nn @@ -552,11 +550,17 @@ def custom_forward(*inputs): def make_log_bucket_position(relative_pos, bucket_size, max_position): - sign = np.sign(relative_pos) + sign = torch.sign(relative_pos) mid = bucket_size // 2 - abs_pos = np.where((relative_pos < mid) & (relative_pos > -mid), mid - 1, np.abs(relative_pos)) - log_pos = np.ceil(np.log(abs_pos / mid) / np.log((max_position - 1) / mid) * (mid - 1)) + mid - bucket_pos = np.where(abs_pos <= mid, relative_pos, log_pos * sign).astype(np.int) + abs_pos = torch.where( + (relative_pos < mid) & (relative_pos > -mid), + torch.tensor(mid - 1).type_as(relative_pos), + torch.abs(relative_pos), + ) + log_pos = ( + torch.ceil(torch.log(abs_pos / mid) / torch.log(torch.tensor((max_position - 1) / mid)) * (mid - 1)) + mid + ) + bucket_pos = torch.where(abs_pos <= mid, relative_pos.type_as(log_pos), log_pos * sign) return bucket_pos @@ -578,12 +582,12 @@ def build_relative_position(query_size, key_size, bucket_size=-1, max_position=- `torch.LongTensor`: A tensor with shape [1, query_size, key_size] """ - q_ids = np.arange(0, query_size) - k_ids = np.arange(0, key_size) - rel_pos_ids = q_ids[:, None] - np.tile(k_ids, (q_ids.shape[0], 1)) + q_ids = torch.arange(0, query_size) + k_ids = torch.arange(0, key_size) + rel_pos_ids = q_ids[:, None] - k_ids[None, :] if bucket_size > 0 and max_position > 0: rel_pos_ids = make_log_bucket_position(rel_pos_ids, bucket_size, max_position) - rel_pos_ids = torch.tensor(rel_pos_ids, dtype=torch.long) + rel_pos_ids = rel_pos_ids.to(torch.long) rel_pos_ids = rel_pos_ids[:query_size, :] rel_pos_ids = rel_pos_ids.unsqueeze(0) return 
rel_pos_ids @@ -712,7 +716,7 @@ def forward( scale_factor += 1 if "p2c" in self.pos_att_type: scale_factor += 1 - scale = math.sqrt(query_layer.size(-1) * scale_factor) + scale = torch.sqrt(torch.tensor(query_layer.size(-1), dtype=torch.float) * scale_factor) attention_scores = torch.bmm(query_layer, key_layer.transpose(-1, -2)) / scale if self.relative_attention: rel_embeddings = self.pos_dropout(rel_embeddings) @@ -787,7 +791,7 @@ def disentangled_attention_bias(self, query_layer, key_layer, relative_pos, rel_ score = 0 # content->position if "c2p" in self.pos_att_type: - scale = math.sqrt(pos_key_layer.size(-1) * scale_factor) + scale = torch.sqrt(torch.tensor(pos_key_layer.size(-1), dtype=torch.float) * scale_factor) c2p_att = torch.bmm(query_layer, pos_key_layer.transpose(-1, -2)) c2p_pos = torch.clamp(relative_pos + att_span, 0, att_span * 2 - 1) c2p_att = torch.gather( @@ -799,7 +803,7 @@ def disentangled_attention_bias(self, query_layer, key_layer, relative_pos, rel_ # position->content if "p2c" in self.pos_att_type: - scale = math.sqrt(pos_query_layer.size(-1) * scale_factor) + scale = torch.sqrt(torch.tensor(pos_query_layer.size(-1), dtype=torch.float) * scale_factor) if key_layer.size(-2) != query_layer.size(-2): r_pos = build_relative_position( key_layer.size(-2), diff --git a/src/transformers/models/sew_d/modeling_sew_d.py b/src/transformers/models/sew_d/modeling_sew_d.py index a9a231aec1d8e6..fe5836a80f36e0 100644 --- a/src/transformers/models/sew_d/modeling_sew_d.py +++ b/src/transformers/models/sew_d/modeling_sew_d.py @@ -194,11 +194,17 @@ def compute_num_masked_span(input_length): # Copied from transformers.models.deberta_v2.modeling_deberta_v2.make_log_bucket_position def make_log_bucket_position(relative_pos, bucket_size, max_position): - sign = np.sign(relative_pos) + sign = torch.sign(relative_pos) mid = bucket_size // 2 - abs_pos = np.where((relative_pos < mid) & (relative_pos > -mid), mid - 1, np.abs(relative_pos)) - log_pos = np.ceil(np.log(abs_pos / mid) / np.log((max_position - 1) / mid) * (mid - 1)) + mid - bucket_pos = np.where(abs_pos <= mid, relative_pos, log_pos * sign).astype(np.int) + abs_pos = torch.where( + (relative_pos < mid) & (relative_pos > -mid), + torch.tensor(mid - 1).type_as(relative_pos), + torch.abs(relative_pos), + ) + log_pos = ( + torch.ceil(torch.log(abs_pos / mid) / torch.log(torch.tensor((max_position - 1) / mid)) * (mid - 1)) + mid + ) + bucket_pos = torch.where(abs_pos <= mid, relative_pos.type_as(log_pos), log_pos * sign) return bucket_pos @@ -221,12 +227,12 @@ def build_relative_position(query_size, key_size, bucket_size=-1, max_position=- `torch.LongTensor`: A tensor with shape [1, query_size, key_size] """ - q_ids = np.arange(0, query_size) - k_ids = np.arange(0, key_size) - rel_pos_ids = q_ids[:, None] - np.tile(k_ids, (q_ids.shape[0], 1)) + q_ids = torch.arange(0, query_size) + k_ids = torch.arange(0, key_size) + rel_pos_ids = q_ids[:, None] - k_ids[None, :] if bucket_size > 0 and max_position > 0: rel_pos_ids = make_log_bucket_position(rel_pos_ids, bucket_size, max_position) - rel_pos_ids = torch.tensor(rel_pos_ids, dtype=torch.long) + rel_pos_ids = rel_pos_ids.to(torch.long) rel_pos_ids = rel_pos_ids[:query_size, :] rel_pos_ids = rel_pos_ids.unsqueeze(0) return rel_pos_ids @@ -784,7 +790,7 @@ def forward( scale_factor += 1 if "p2c" in self.pos_att_type: scale_factor += 1 - scale = math.sqrt(query_layer.size(-1) * scale_factor) + scale = torch.sqrt(torch.tensor(query_layer.size(-1), dtype=torch.float) * scale_factor) 
attention_scores = torch.bmm(query_layer, key_layer.transpose(-1, -2)) / scale if self.relative_attention: rel_embeddings = self.pos_dropout(rel_embeddings) @@ -859,7 +865,7 @@ def disentangled_attention_bias(self, query_layer, key_layer, relative_pos, rel_ score = 0 # content->position if "c2p" in self.pos_att_type: - scale = math.sqrt(pos_key_layer.size(-1) * scale_factor) + scale = torch.sqrt(torch.tensor(pos_key_layer.size(-1), dtype=torch.float) * scale_factor) c2p_att = torch.bmm(query_layer, pos_key_layer.transpose(-1, -2)) c2p_pos = torch.clamp(relative_pos + att_span, 0, att_span * 2 - 1) c2p_att = torch.gather( @@ -871,7 +877,7 @@ def disentangled_attention_bias(self, query_layer, key_layer, relative_pos, rel_ # position->content if "p2c" in self.pos_att_type: - scale = math.sqrt(pos_query_layer.size(-1) * scale_factor) + scale = torch.sqrt(torch.tensor(pos_query_layer.size(-1), dtype=torch.float) * scale_factor) if key_layer.size(-2) != query_layer.size(-2): r_pos = build_relative_position( key_layer.size(-2), From 1c38f1ac1e81349d72ab65f6b2b8ececbcb957ea Mon Sep 17 00:00:00 2001 From: Michael Benayoun Date: Thu, 11 Aug 2022 16:34:44 +0200 Subject: [PATCH 115/162] [FX] _generate_dummy_input supports audio-classification models for labels (#18580) * Support audio classification architectures for labels generation, as well as provides a flag to print warnings or not * Use ENV_VARS_TRUE_VALUES --- src/transformers/utils/fx.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/src/transformers/utils/fx.py b/src/transformers/utils/fx.py index 2198928eadb325..990f278b0d5066 100644 --- a/src/transformers/utils/fx.py +++ b/src/transformers/utils/fx.py @@ -19,6 +19,7 @@ import inspect import math import operator +import os import random import warnings from typing import Any, Callable, Dict, List, Optional, Type, Union @@ -48,11 +49,12 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, MODEL_MAPPING_NAMES, ) -from ..utils import TORCH_FX_REQUIRED_VERSION, is_torch_fx_available +from ..utils import ENV_VARS_TRUE_VALUES, TORCH_FX_REQUIRED_VERSION, is_torch_fx_available from ..utils.versions import importlib_metadata logger = logging.get_logger(__name__) +_IS_IN_DEBUG_MODE = os.environ.get("FX_DEBUG_MODE", "").upper() in ENV_VARS_TRUE_VALUES def _generate_supported_model_class_names( @@ -678,7 +680,12 @@ def _generate_dummy_input( if input_name in ["labels", "start_positions", "end_positions"]: batch_size = shape[0] - if model_class_name in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES): + if model_class_name in [ + *get_values(MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES), + *get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES), + *get_values(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES), + *get_values(MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES), + ]: inputs_dict["labels"] = torch.zeros(batch_size, dtype=torch.long, device=device) elif model_class_name in [ *get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES), @@ -710,11 +717,6 @@ def _generate_dummy_input( ) inputs_dict["labels"] = torch.zeros(*labels_shape, dtype=labels_dtype, device=device) - elif model_class_name in [ - *get_values(MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES), - *get_values(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES), - ]: - inputs_dict["labels"] = torch.zeros(batch_size, dtype=torch.long, device=device) elif model_class_name in [ *get_values(MODEL_FOR_PRETRAINING_MAPPING_NAMES), *get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES), @@ -725,7 +727,9 @@ def 
_generate_dummy_input( ]: inputs_dict["labels"] = torch.zeros(shape, dtype=torch.long, device=device) else: - raise NotImplementedError(f"{model_class_name} not supported yet.") + raise NotImplementedError( + f"Generating the dummy input named {input_name} for {model_class_name} is not supported yet." + ) elif "pixel_values" in input_name: batch_size = shape[0] image_size = getattr(model.config, "image_size", None) @@ -846,7 +850,8 @@ def create_proxy(self, kind, target, args, kwargs, name=None, type_expr=None, pr raise ValueError("Don't support composite output yet") rv.install_metadata(meta_out) except Exception as e: - warnings.warn(f"Could not compute metadata for {kind} target {target}: {e}") + if _IS_IN_DEBUG_MODE: + warnings.warn(f"Could not compute metadata for {kind} target {target}: {e}") return rv From 529ac2bb7b2417c595f54c64ee398125fabbd135 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Thu, 11 Aug 2022 10:35:47 -0400 Subject: [PATCH 116/162] Fix docstrings with last version of hf-doc-builder styler (#18581) * Fix docstrings with last version of hf-doc-builder styler * Remove empty Parameter block --- src/transformers/benchmark/benchmark_utils.py | 5 ----- src/transformers/generation_flax_utils.py | 1 - src/transformers/generation_tf_utils.py | 5 ----- src/transformers/generation_utils.py | 5 ----- src/transformers/modelcard.py | 2 -- src/transformers/models/auto/auto_factory.py | 1 - src/transformers/models/flaubert/tokenization_flaubert.py | 1 - src/transformers/models/fsmt/tokenization_fsmt.py | 1 - src/transformers/models/perceiver/modeling_perceiver.py | 1 - src/transformers/models/tapex/tokenization_tapex.py | 1 - src/transformers/models/transfo_xl/modeling_transfo_xl.py | 1 - src/transformers/models/xlm/tokenization_xlm.py | 1 - src/transformers/testing_utils.py | 1 - src/transformers/trainer_pt_utils.py | 1 - src/transformers/trainer_utils.py | 1 - src/transformers/utils/notebook.py | 2 -- 16 files changed, 30 deletions(-) diff --git a/src/transformers/benchmark/benchmark_utils.py b/src/transformers/benchmark/benchmark_utils.py index 36fe5eb116cbef..79740805807185 100644 --- a/src/transformers/benchmark/benchmark_utils.py +++ b/src/transformers/benchmark/benchmark_utils.py @@ -79,7 +79,6 @@ def separate_process_wrapper_fn(func: Callable[[], None], do_multi_processing: b measurements it is important that the function is executed in a separate process Args: - - `func`: (`callable`): function() -> ... generic function which will be executed in its own separate process - `do_multi_processing`: (`bool`) Whether to run function on separate process or not """ @@ -210,7 +209,6 @@ def measure_peak_memory_cpu(function: Callable[[], None], interval=0.5, device_i https://github.com/pythonprofilers/memory_profiler/blob/895c4ac7a08020d66ae001e24067da6dcea42451/memory_profiler.py#L239 Args: - - `function`: (`callable`): function() -> ... function without any arguments to measure for which to measure the peak memory @@ -228,7 +226,6 @@ def get_cpu_memory(process_id: int) -> int: measures current cpu memory usage of a given `process_id` Args: - - `process_id`: (`int`) process_id for which to measure memory Returns @@ -336,7 +333,6 @@ def start_memory_tracing( https://psutil.readthedocs.io/en/latest/#psutil.Process.memory_info Args: - - `modules_to_trace`: (None, string, list/tuple of string) if None, all events are recorded if string or list of strings: only events from the listed module/sub-module will be recorded (e.g. 
'fairseq' or 'transformers.models.gpt2.modeling_gpt2') @@ -483,7 +479,6 @@ def stop_memory_tracing( Stop memory tracing cleanly and return a summary of the memory trace if a trace is given. Args: - `memory_trace` (optional output of start_memory_tracing, default: None): memory trace to convert in summary `ignore_released_memory` (boolean, default: None): diff --git a/src/transformers/generation_flax_utils.py b/src/transformers/generation_flax_utils.py index 2f80c7fcf27e96..fd26a605c48bac 100644 --- a/src/transformers/generation_flax_utils.py +++ b/src/transformers/generation_flax_utils.py @@ -208,7 +208,6 @@ def generate( post](https://huggingface.co/blog/how-to-generate). Parameters: - input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`): The sequence used as a prompt for the generation. max_length (`int`, *optional*, defaults to `model.config.max_length`): diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py index a3d26b789c646e..6c8da54835ac92 100644 --- a/src/transformers/generation_tf_utils.py +++ b/src/transformers/generation_tf_utils.py @@ -418,7 +418,6 @@ def generate( post](https://huggingface.co/blog/how-to-generate). Parameters: - input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, `(batch_size, sequence_length, feature_dim)` or `(batch_size, num_channels, height, width)`, *optional*): The sequence used as a prompt for the generation or as model inputs to the encoder. If `None` the @@ -1336,7 +1335,6 @@ def _generate( post](https://huggingface.co/blog/how-to-generate). Parameters: - input_ids (`tf.Tensor` of `dtype=tf.int32` and shape `(batch_size, sequence_length)`, *optional*): The sequence used as a prompt for the generation. If `None` the method initializes it with `bos_token_id` and a batch size of 1. @@ -2070,7 +2068,6 @@ def greedy_search( Generates sequences for models with a language modeling head using greedy decoding. Parameters: - input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`): The sequence used as a prompt for the generation. logits_processor (`TFLogitsProcessorList`, *optional*): @@ -2323,7 +2320,6 @@ def sample( Generates sequences for models with a language modeling head using multinomial sampling. Parameters: - input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`): The sequence used as a prompt for the generation. logits_processor (`TFLogitsProcessorList`, *optional*): @@ -2600,7 +2596,6 @@ def beam_search( Generates sequences for models with a language modeling head using beam search with multinomial sampling. Parameters: - input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`): The sequence used as a prompt for the generation. max_length (`int`, *optional*, defaults to 20): diff --git a/src/transformers/generation_utils.py b/src/transformers/generation_utils.py index bb9330de37f0cf..8f6dc6a383a774 100644 --- a/src/transformers/generation_utils.py +++ b/src/transformers/generation_utils.py @@ -1555,7 +1555,6 @@ def greedy_search( used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. Parameters: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): The sequence used as a prompt for the generation. logits_processor (`LogitsProcessorList`, *optional*): @@ -1789,7 +1788,6 @@ def sample( can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. Parameters: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): The sequence used as a prompt for the generation. 
logits_processor (`LogitsProcessorList`, *optional*): @@ -2046,7 +2044,6 @@ def beam_search( can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. Parameters: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): The sequence used as a prompt for the generation. beam_scorer (`BeamScorer`): @@ -2355,7 +2352,6 @@ def beam_sample( sampling** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. Parameters: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): The sequence used as a prompt for the generation. beam_scorer (`BeamScorer`): @@ -2672,7 +2668,6 @@ def group_beam_search( decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. Parameters: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): The sequence used as a prompt for the generation. beam_scorer (`BeamScorer`): diff --git a/src/transformers/modelcard.py b/src/transformers/modelcard.py index dc842c2abbf72c..6743c5624eaf3c 100644 --- a/src/transformers/modelcard.py +++ b/src/transformers/modelcard.py @@ -80,8 +80,6 @@ class ModelCard: Inioluwa Deborah Raji and Timnit Gebru for the proposal behind model cards. Link: https://arxiv.org/abs/1810.03993 Note: A model card can be loaded and saved to disk. - - Parameters: """ def __init__(self, **kwargs): diff --git a/src/transformers/models/auto/auto_factory.py b/src/transformers/models/auto/auto_factory.py index b412f14157f1c3..8d3fabda4706eb 100644 --- a/src/transformers/models/auto/auto_factory.py +++ b/src/transformers/models/auto/auto_factory.py @@ -563,7 +563,6 @@ class _LazyAutoMapping(OrderedDict): " A mapping config to object (model or tokenizer for instance) that will load keys and values when it is accessed. Args: - - config_mapping: The map model type to config class - model_mapping: The map model type to model (or tokenizer) class """ diff --git a/src/transformers/models/flaubert/tokenization_flaubert.py b/src/transformers/models/flaubert/tokenization_flaubert.py index 5d5ad2a657d1bc..911ef37dac5046 100644 --- a/src/transformers/models/flaubert/tokenization_flaubert.py +++ b/src/transformers/models/flaubert/tokenization_flaubert.py @@ -130,7 +130,6 @@ def _tokenize(self, text, bypass_tokenizer=False): - Install with `pip install sacremoses` Args: - - bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False) (bool). If True, we only apply BPE. diff --git a/src/transformers/models/fsmt/tokenization_fsmt.py b/src/transformers/models/fsmt/tokenization_fsmt.py index 34272e53cf0fcb..66d9819785483c 100644 --- a/src/transformers/models/fsmt/tokenization_fsmt.py +++ b/src/transformers/models/fsmt/tokenization_fsmt.py @@ -354,7 +354,6 @@ def _tokenize(self, text, lang="en", bypass_tokenizer=False): - Install with `pip install sacremoses` Args: - - lang: ISO language code (default = 'en') (string). Languages should belong of the model supported languages. However, we don't enforce it. - bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False) diff --git a/src/transformers/models/perceiver/modeling_perceiver.py b/src/transformers/models/perceiver/modeling_perceiver.py index b3a0beea3d3ca4..d069182f06c3c7 100755 --- a/src/transformers/models/perceiver/modeling_perceiver.py +++ b/src/transformers/models/perceiver/modeling_perceiver.py @@ -1960,7 +1960,6 @@ def build_position_encoding( Builds the position encoding. 
Args: - - out_channels: refers to the number of channels of the position encodings. - project_pos_dim: if specified, will project the position encodings to this dimension. diff --git a/src/transformers/models/tapex/tokenization_tapex.py b/src/transformers/models/tapex/tokenization_tapex.py index 7c0725ffe7c108..555bf9fd2c6b9a 100644 --- a/src/transformers/models/tapex/tokenization_tapex.py +++ b/src/transformers/models/tapex/tokenization_tapex.py @@ -1398,7 +1398,6 @@ def truncate_table_rows( ): """ Args: - table_content: {"header": xxx, "rows": xxx, "id" (Optionally): xxx} diff --git a/src/transformers/models/transfo_xl/modeling_transfo_xl.py b/src/transformers/models/transfo_xl/modeling_transfo_xl.py index 75793466c7a8d1..257c45af03bbc0 100644 --- a/src/transformers/models/transfo_xl/modeling_transfo_xl.py +++ b/src/transformers/models/transfo_xl/modeling_transfo_xl.py @@ -523,7 +523,6 @@ def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, layer: O weights embeddings afterwards if the model class has a *tie_weights()* method. Arguments: - new_num_tokens: (*optional*) int: New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end. If not provided or None: does nothing and diff --git a/src/transformers/models/xlm/tokenization_xlm.py b/src/transformers/models/xlm/tokenization_xlm.py index bd7b58eb053b0e..8bb021c5b96987 100644 --- a/src/transformers/models/xlm/tokenization_xlm.py +++ b/src/transformers/models/xlm/tokenization_xlm.py @@ -791,7 +791,6 @@ def _tokenize(self, text, lang="en", bypass_tokenizer=False): externally, and set `bypass_tokenizer=True` to bypass the tokenizer. Args: - - lang: ISO language code (default = 'en') (string). Languages should belong of the model supported languages. However, we don't enforce it. - bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False) diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index d21f353a60a8f5..2e99a76232c27c 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -1286,7 +1286,6 @@ def pytest_terminal_summary_main(tr, id): there. Args: - - tr: `terminalreporter` passed from `conftest.py` - id: unique id like `tests` or `examples` that will be incorporated into the final reports filenames - this is needed as some jobs have multiple runs of pytest, so we can't have them overwrite each other. diff --git a/src/transformers/trainer_pt_utils.py b/src/transformers/trainer_pt_utils.py index e1ad471b07a9e0..57103b50d5a039 100644 --- a/src/transformers/trainer_pt_utils.py +++ b/src/transformers/trainer_pt_utils.py @@ -377,7 +377,6 @@ class DistributedTensorGatherer: For some reason, that's not going to roll their boat. This class is there to solve that problem. Args: - world_size (`int`): The number of processes used in the distributed training. num_samples (`int`): diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py index 579e5d1dc24ce4..a298fc1de5719e 100644 --- a/src/transformers/trainer_utils.py +++ b/src/transformers/trainer_utils.py @@ -337,7 +337,6 @@ def speed_metrics(split, start_time, num_samples=None, num_steps=None): should be run immediately after the operation to be measured has completed. Args: - - split: name to prefix metric (like train, eval, test...) 
- start_time: operation start time - num_samples: number of samples processed diff --git a/src/transformers/utils/notebook.py b/src/transformers/utils/notebook.py index 8d81d76c4fd166..636cf785ea94ea 100644 --- a/src/transformers/utils/notebook.py +++ b/src/transformers/utils/notebook.py @@ -120,7 +120,6 @@ def update(self, value: int, force_update: bool = False, comment: str = None): The main method to update the progress bar to `value`. Args: - value (`int`): The value to use. Must be between 0 and `total`. force_update (`bool`, *optional*, defaults to `False`): @@ -204,7 +203,6 @@ class NotebookTrainingTracker(NotebookProgressBar): An object tracking the updates of an ongoing training with progress bars and a nice table reporting metrics. Args: - num_steps (`int`): The number of steps during training. column_names (`List[str]`, *optional*): The list of column names for the metrics table (will be inferred from the first call to [`~utils.notebook.NotebookTrainingTracker.write_line`] if not set). From 8a8a9a1cafce664e4dbcd087165435ebf9b47cde Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 11 Aug 2022 10:47:19 -0400 Subject: [PATCH 117/162] Bump nbconvert from 6.0.1 to 6.3.0 in /examples/research_projects/lxmert (#18565) Bumps [nbconvert](https://github.com/jupyter/nbconvert) from 6.0.1 to 6.3.0. - [Release notes](https://github.com/jupyter/nbconvert/releases) - [Commits](https://github.com/jupyter/nbconvert/compare/6.0.1...6.3.0) --- updated-dependencies: - dependency-name: nbconvert dependency-type: direct:production ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- examples/research_projects/lxmert/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/research_projects/lxmert/requirements.txt b/examples/research_projects/lxmert/requirements.txt index 28a15ccb6ada21..9e7cc88ce0828f 100644 --- a/examples/research_projects/lxmert/requirements.txt +++ b/examples/research_projects/lxmert/requirements.txt @@ -43,7 +43,7 @@ matplotlib==3.3.1 mistune==2.0.3 msgpack==0.6.2 nbclient==0.5.0 -nbconvert==6.0.1 +nbconvert==6.3.0 nbformat==5.0.7 nest-asyncio==1.4.0 notebook==6.4.12 From 5a4679923c2d9251aae1a4cd0bb15259dedeee6b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 11 Aug 2022 10:47:31 -0400 Subject: [PATCH 118/162] Bump nbconvert in /examples/research_projects/visual_bert (#18566) Bumps [nbconvert](https://github.com/jupyter/nbconvert) from 6.0.1 to 6.3.0. - [Release notes](https://github.com/jupyter/nbconvert/releases) - [Commits](https://github.com/jupyter/nbconvert/compare/6.0.1...6.3.0) --- updated-dependencies: - dependency-name: nbconvert dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- examples/research_projects/visual_bert/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/research_projects/visual_bert/requirements.txt b/examples/research_projects/visual_bert/requirements.txt index 28a15ccb6ada21..9e7cc88ce0828f 100644 --- a/examples/research_projects/visual_bert/requirements.txt +++ b/examples/research_projects/visual_bert/requirements.txt @@ -43,7 +43,7 @@ matplotlib==3.3.1 mistune==2.0.3 msgpack==0.6.2 nbclient==0.5.0 -nbconvert==6.0.1 +nbconvert==6.3.0 nbformat==5.0.7 nest-asyncio==1.4.0 notebook==6.4.12 From 5d1df723abaa6b7963150338edf433a572bb24ce Mon Sep 17 00:00:00 2001 From: Alara Dirik <8944735+alaradirik@users.noreply.github.com> Date: Thu, 11 Aug 2022 19:10:25 +0300 Subject: [PATCH 119/162] fix owlvit tests, update docstring examples (#18586) --- docs/source/en/model_doc/owlvit.mdx | 4 ++-- src/transformers/models/owlvit/modeling_owlvit.py | 4 ++-- tests/models/owlvit/test_modeling_owlvit.py | 7 +++---- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/docs/source/en/model_doc/owlvit.mdx b/docs/source/en/model_doc/owlvit.mdx index 0b61d7b274a0c7..ddbc2826d7a655 100644 --- a/docs/source/en/model_doc/owlvit.mdx +++ b/docs/source/en/model_doc/owlvit.mdx @@ -57,8 +57,8 @@ OWL-ViT is a zero-shot text-conditioned object detection model. OWL-ViT uses [CL ... box = [round(i, 2) for i in box.tolist()] ... if score >= score_threshold: ... print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}") -Detected a photo of a cat with confidence 0.243 at location [1.42, 50.69, 308.58, 370.48] -Detected a photo of a cat with confidence 0.298 at location [348.06, 20.56, 642.33, 372.61] +Detected a photo of a cat with confidence 0.707 at location [324.97, 20.44, 640.58, 373.29] +Detected a photo of a cat with confidence 0.717 at location [1.46, 55.26, 315.55, 472.17] ``` This model was contributed by [adirik](https://huggingface.co/adirik). The original code can be found [here](https://github.com/google-research/scenic/tree/main/scenic/projects/owl_vit). diff --git a/src/transformers/models/owlvit/modeling_owlvit.py b/src/transformers/models/owlvit/modeling_owlvit.py index 73ee2597f1b163..c0386ab23d3fba 100644 --- a/src/transformers/models/owlvit/modeling_owlvit.py +++ b/src/transformers/models/owlvit/modeling_owlvit.py @@ -1323,8 +1323,8 @@ def forward( ... box = [round(i, 2) for i in box.tolist()] ... if score >= score_threshold: ... 
print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}") - Detected a photo of a cat with confidence 0.243 at location [1.42, 50.69, 308.58, 370.48] - Detected a photo of a cat with confidence 0.298 at location [348.06, 20.56, 642.33, 372.61] + Detected a photo of a cat with confidence 0.707 at location [324.97, 20.44, 640.58, 373.29] + Detected a photo of a cat with confidence 0.717 at location [1.46, 55.26, 315.55, 472.17] ```""" output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states diff --git a/tests/models/owlvit/test_modeling_owlvit.py b/tests/models/owlvit/test_modeling_owlvit.py index 7564d192ad9898..e8f615ec8e54f0 100644 --- a/tests/models/owlvit/test_modeling_owlvit.py +++ b/tests/models/owlvit/test_modeling_owlvit.py @@ -733,7 +733,6 @@ def prepare_img(): @require_vision @require_torch -@unittest.skip("These tests are broken, fix me Alara") class OwlViTModelIntegrationTest(unittest.TestCase): @slow def test_inference(self): @@ -763,8 +762,7 @@ def test_inference(self): outputs.logits_per_text.shape, torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])), ) - expected_logits = torch.tensor([[4.4420, 0.6181]], device=torch_device) - + expected_logits = torch.tensor([[3.4613, 0.9403]], device=torch_device) self.assertTrue(torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3)) @slow @@ -788,7 +786,8 @@ def test_inference_object_detection(self): num_queries = int((model.config.vision_config.image_size / model.config.vision_config.patch_size) ** 2) self.assertEqual(outputs.pred_boxes.shape, torch.Size((1, num_queries, 4))) + expected_slice_boxes = torch.tensor( - [[0.0948, 0.0471, 0.1915], [0.3194, 0.0583, 0.6498], [0.1441, 0.0452, 0.2197]] + [[0.0691, 0.0445, 0.1373], [0.1592, 0.0456, 0.3192], [0.1632, 0.0423, 0.2478]] ).to(torch_device) self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)) From f03866fd5c714846a1792d38fdb6b9e42ce7985b Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Thu, 11 Aug 2022 17:32:11 +0100 Subject: [PATCH 120/162] Return the permuted hidden states if return_dict=True (#18578) --- src/transformers/models/convnext/modeling_tf_convnext.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py index 405aeff6e0bdd5..0be2d291923812 100644 --- a/src/transformers/models/convnext/modeling_tf_convnext.py +++ b/src/transformers/models/convnext/modeling_tf_convnext.py @@ -330,7 +330,8 @@ def call( hidden_states = tuple([tf.transpose(h, perm=(0, 3, 1, 2)) for h in encoder_outputs[1]]) if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] + hidden_states = hidden_states if output_hidden_states else () + return (last_hidden_state, pooled_output) + hidden_states return TFBaseModelOutputWithPooling( last_hidden_state=last_hidden_state, From 261f48072c5c7088ee7caec70e5b7887cba456d0 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Fri, 12 Aug 2022 09:48:10 +0200 Subject: [PATCH 121/162] Load sharded pt to flax (#18419) * initial commit * add small test * add cross pt tf flag to test * fix quality * style * update test with new repo * fix failing test * update * fix wrong param ordering * style * update based on review * update related to recent new caching 
mechanism * quality * Update based on review Co-authored-by: sgugger * quality and style * Update src/transformers/modeling_flax_utils.py Co-authored-by: sgugger Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- .../modeling_flax_pytorch_utils.py | 74 +++++++++++++++++-- src/transformers/modeling_flax_utils.py | 20 ++++- tests/test_modeling_flax_common.py | 8 ++ 3 files changed, 94 insertions(+), 8 deletions(-) diff --git a/src/transformers/modeling_flax_pytorch_utils.py b/src/transformers/modeling_flax_pytorch_utils.py index a91d41b9d6d91b..76eaa53f89d04c 100644 --- a/src/transformers/modeling_flax_pytorch_utils.py +++ b/src/transformers/modeling_flax_pytorch_utils.py @@ -38,7 +38,9 @@ ##################### -def load_pytorch_checkpoint_in_flax_state_dict(flax_model, pytorch_checkpoint_path, allow_missing_keys=False): +def load_pytorch_checkpoint_in_flax_state_dict( + flax_model, pytorch_checkpoint_path, is_sharded, allow_missing_keys=False +): """Load pytorch checkpoints in a flax model""" try: import torch # noqa: F401 @@ -50,14 +52,17 @@ def load_pytorch_checkpoint_in_flax_state_dict(flax_model, pytorch_checkpoint_pa ) raise - pt_path = os.path.abspath(pytorch_checkpoint_path) - logger.info(f"Loading PyTorch weights from {pt_path}") + if not is_sharded: + pt_path = os.path.abspath(pytorch_checkpoint_path) + logger.info(f"Loading PyTorch weights from {pt_path}") - pt_state_dict = torch.load(pt_path, map_location="cpu") - logger.info(f"PyTorch checkpoint contains {sum(t.numel() for t in pt_state_dict.values()):,} parameters.") - - flax_state_dict = convert_pytorch_state_dict_to_flax(pt_state_dict, flax_model) + pt_state_dict = torch.load(pt_path, map_location="cpu") + logger.info(f"PyTorch checkpoint contains {sum(t.numel() for t in pt_state_dict.values()):,} parameters.") + flax_state_dict = convert_pytorch_state_dict_to_flax(pt_state_dict, flax_model) + else: + # model is sharded and pytorch_checkpoint_path already contains the list of .pt shard files + flax_state_dict = convert_pytorch_sharded_state_dict_to_flax(pytorch_checkpoint_path, flax_model) return flax_state_dict @@ -156,6 +161,61 @@ def convert_pytorch_state_dict_to_flax(pt_state_dict, flax_model): return unflatten_dict(flax_state_dict) +############################ +# Sharded Pytorch => Flax # +############################ + + +def convert_pytorch_sharded_state_dict_to_flax(shard_filenames, flax_model): + import torch + + # Load the index + flax_state_dict = {} + for shard_file in shard_filenames: + # load each shard with torch + pt_state_dict = torch.load(shard_file) + pt_state_dict = {k: v.numpy() for k, v in pt_state_dict.items()} + + model_prefix = flax_model.base_model_prefix + random_flax_state_dict = flatten_dict(flax_model.params) + + load_model_with_head_into_base_model = (model_prefix not in flax_model.params) and ( + model_prefix in set([k.split(".")[0] for k in pt_state_dict.keys()]) + ) + load_base_model_into_model_with_head = (model_prefix in flax_model.params) and ( + model_prefix not in set([k.split(".")[0] for k in pt_state_dict.keys()]) + ) + # Need to change some parameter names to match Flax names + for pt_key, pt_tensor in pt_state_dict.items(): + + pt_tuple_key = tuple(pt_key.split(".")) + + # remove base model prefix if necessary + has_base_model_prefix = pt_tuple_key[0] == model_prefix + if load_model_with_head_into_base_model and has_base_model_prefix: + pt_tuple_key = pt_tuple_key[1:] + + # Correctly rename weight parameters + flax_key, flax_tensor =
rename_key_and_reshape_tensor( + pt_tuple_key, pt_tensor, random_flax_state_dict, model_prefix + ) + # add model prefix if necessary + require_base_model_prefix = (model_prefix,) + flax_key in random_flax_state_dict + if load_base_model_into_model_with_head and require_base_model_prefix: + flax_key = (model_prefix,) + flax_key + + if flax_key in random_flax_state_dict: + if flax_tensor.shape != random_flax_state_dict[flax_key].shape: + raise ValueError( + f"PyTorch checkpoint seems to be incorrect. Weight {pt_key} was expected to be of shape " + f"{random_flax_state_dict[flax_key].shape}, but is {flax_tensor.shape}." + ) + + # also add unexpected weight so that warning is thrown + flax_state_dict[flax_key] = jnp.asarray(flax_tensor) + return unflatten_dict(flax_state_dict) + + ##################### # Flax => PyTorch # ##################### diff --git a/src/transformers/modeling_flax_utils.py b/src/transformers/modeling_flax_utils.py index 683e25631c0f44..00bb5480ffe3e9 100644 --- a/src/transformers/modeling_flax_utils.py +++ b/src/transformers/modeling_flax_utils.py @@ -40,6 +40,7 @@ from .utils import ( FLAX_WEIGHTS_INDEX_NAME, FLAX_WEIGHTS_NAME, + WEIGHTS_INDEX_NAME, WEIGHTS_NAME, PushToHubMixin, add_code_sample_docstrings, @@ -650,6 +651,10 @@ def from_pretrained( if from_pt and os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)): # Load from a PyTorch checkpoint archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) + elif from_pt and os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_INDEX_NAME)): + # Load from a sharded pytorch checkpoint + archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_INDEX_NAME) + is_sharded = True elif os.path.isfile(os.path.join(pretrained_model_name_or_path, FLAX_WEIGHTS_NAME)): # Load from a Flax checkpoint archive_file = os.path.join(pretrained_model_name_or_path, FLAX_WEIGHTS_NAME) @@ -700,6 +705,13 @@ def from_pretrained( ) if resolved_archive_file is not None: is_sharded = True + # Maybe the checkpoint is pytorch sharded, we try to grab the pytorch index name in this case. + elif resolved_archive_file is None and from_pt: + resolved_archive_file = cached_file( + pretrained_model_name_or_path, WEIGHTS_INDEX_NAME, **cached_file_kwargs + ) + if resolved_archive_file is not None: + is_sharded = True if resolved_archive_file is None: # Otherwise, maybe there is a TF or Flax model file. We try those to give a helpful error # message. @@ -714,6 +726,12 @@ def from_pretrained( f" {FLAX_WEIGHTS_NAME} but there is a file for PyTorch weights. Use `from_pt=True` to" " load this model from those weights." ) + elif has_file(pretrained_model_name_or_path, WEIGHTS_INDEX_NAME, **has_file_kwargs): + raise EnvironmentError( + f"{pretrained_model_name_or_path} does not appear to have a file named" + f" {FLAX_WEIGHTS_INDEX_NAME} but there is a sharded file for PyTorch weights. Use" + " `from_pt=True` to load this model from those weights." 
+ ) else: raise EnvironmentError( f"{pretrained_model_name_or_path} does not appear to have a file named" @@ -761,7 +779,7 @@ def from_pretrained( model = cls(config, *model_args, _do_init=_do_init, **model_kwargs) if from_pt: - state = load_pytorch_checkpoint_in_flax_state_dict(model, resolved_archive_file) + state = load_pytorch_checkpoint_in_flax_state_dict(model, resolved_archive_file, is_sharded) else: if is_sharded: diff --git a/tests/test_modeling_flax_common.py b/tests/test_modeling_flax_common.py index e22c7e6705b3bd..837f874889ae7d 100644 --- a/tests/test_modeling_flax_common.py +++ b/tests/test_modeling_flax_common.py @@ -1099,6 +1099,14 @@ def test_checkpoint_sharding_local(self): for p1, p2 in zip(flatten_dict(model.params).values(), flatten_dict(new_model.params).values()): self.assertTrue(np.allclose(np.array(p1), np.array(p2))) + @is_pt_flax_cross_test + def test_from_sharded_pt(self): + model = FlaxBertModel.from_pretrained("hf-internal-testing/tiny-random-bert-sharded", from_pt=True) + ref_model = FlaxBertModel.from_pretrained("hf-internal-testing/tiny-random-bert-fx-only") + for key, ref_val in flatten_dict(ref_model.params).items(): + val = flatten_dict(model.params)[key] + assert np.allclose(np.array(val), np.array(ref_val)) + def test_gradient_checkpointing(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() From ff90f49662fe527d02dc968c696890e144c5e128 Mon Sep 17 00:00:00 2001 From: Ian Castillo <7807897+donelianc@users.noreply.github.com> Date: Fri, 12 Aug 2022 13:11:28 +0200 Subject: [PATCH 122/162] Add type hints for ViLT models (#18577) * Add type hints for Vilt models * Add missing return type for TokenClassification class --- src/transformers/models/vilt/modeling_vilt.py | 158 +++++++++--------- 1 file changed, 79 insertions(+), 79 deletions(-) diff --git a/src/transformers/models/vilt/modeling_vilt.py b/src/transformers/models/vilt/modeling_vilt.py index 308358850c9808..dab78c0bce8687 100755 --- a/src/transformers/models/vilt/modeling_vilt.py +++ b/src/transformers/models/vilt/modeling_vilt.py @@ -17,7 +17,7 @@ import collections.abc import math from dataclasses import dataclass -from typing import List, Optional, Tuple +from typing import List, Optional, Tuple, Union import torch import torch.utils.checkpoint @@ -761,19 +761,19 @@ class PreTrainedModel @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC) def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - pixel_values=None, - pixel_mask=None, - head_mask=None, - inputs_embeds=None, - image_embeds=None, - image_token_type_idx=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + pixel_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + image_embeds: Optional[torch.FloatTensor] = None, + image_token_type_idx: Optional[int] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[BaseModelOutputWithPooling, Tuple[torch.FloatTensor]]: r""" Returns: @@ -914,19 +914,19 @@ def set_output_embeddings(self, new_embeddings): @replace_return_docstrings(output_type=MaskedLMOutput, 
config_class=_CONFIG_FOR_DOC) def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - pixel_values=None, - pixel_mask=None, - head_mask=None, - inputs_embeds=None, - image_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + pixel_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + image_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[MaskedLMOutput, Tuple[torch.FloatTensor]]: r""" labels (*torch.LongTensor* of shape *(batch_size, sequence_length)*, *optional*): Labels for computing the masked language modeling loss. Indices should be in *[-100, 0, ..., @@ -1088,19 +1088,19 @@ def __init__(self, config): @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - pixel_values=None, - pixel_mask=None, - head_mask=None, - inputs_embeds=None, - image_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + pixel_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + image_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[SequenceClassifierOutput, Tuple[torch.FloatTensor]]: r""" labels (`torch.FloatTensor` of shape `(batch_size, num_labels)`, *optional*): Labels for computing the visual question answering loss. 
This tensor must be either a one-hot encoding of @@ -1193,19 +1193,19 @@ def __init__(self, config): @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - pixel_values=None, - pixel_mask=None, - head_mask=None, - inputs_embeds=None, - image_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + pixel_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + image_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[SequenceClassifierOutput, Tuple[torch.FloatTensor]]: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels are currently not supported. @@ -1299,19 +1299,19 @@ def __init__(self, config): @replace_return_docstrings(output_type=ViltForImagesAndTextClassificationOutput, config_class=_CONFIG_FOR_DOC) def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - pixel_values=None, - pixel_mask=None, - head_mask=None, - inputs_embeds=None, - image_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + pixel_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + image_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[ViltForImagesAndTextClassificationOutput, Tuple[torch.FloatTensor]]: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Binary classification labels. 
@@ -1436,19 +1436,19 @@ def __init__(self, config): @replace_return_docstrings(output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC) def forward( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - pixel_values=None, - pixel_mask=None, - head_mask=None, - inputs_embeds=None, - image_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + pixel_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + image_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[TokenClassifierOutput, Tuple[torch.FloatTensor]]: r""" labels (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. From 1e7062af4ae1883f217b4ff143841ee2df88f5d7 Mon Sep 17 00:00:00 2001 From: "Wang, Yi" Date: Fri, 12 Aug 2022 20:36:27 +0800 Subject: [PATCH 123/162] update doc for perf_train_cpu_many, add intel mpi introduction (#18576) * update doc for perf_train_cpu_many, add mpi introduction Signed-off-by: Wang, Yi A * Update docs/source/en/perf_train_cpu_many.mdx Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update docs/source/en/perf_train_cpu_many.mdx Signed-off-by: Wang, Yi A Signed-off-by: Wang, Yi A Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- docs/source/en/perf_train_cpu_many.mdx | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/docs/source/en/perf_train_cpu_many.mdx b/docs/source/en/perf_train_cpu_many.mdx index 5705517f5b1b4a..f4f77965748e3e 100644 --- a/docs/source/en/perf_train_cpu_many.mdx +++ b/docs/source/en/perf_train_cpu_many.mdx @@ -36,8 +36,22 @@ pip install oneccl_bind_pt=={pytorch_version} -f https://software.intel.com/ipex ``` where `{pytorch_version}` should be your PyTorch version, for instance 1.12.0. Check more approaches for [oneccl_bind_pt installation](https://github.com/intel/torch-ccl). +Versions of oneCCL and PyTorch must match. -### Usage in Trainer +## Intel® MPI library +Use this standards-based MPI implementation to deliver flexible, efficient, scalable cluster messaging on Intel® architecture. This component is part of the Intel® oneAPI HPC Toolkit. +It can be installed via [MPI](https://www.intel.com/content/www/us/en/developer/articles/tool/oneapi-standalone-components.html#mpi). + +Please set the environment with the following command before using it. + +``` +source /opt/intel/oneapi/setvars.sh +``` + +The following "Usage in Trainer" section uses mpirun from the Intel® MPI library as an example. + + +## Usage in Trainer To enable multi CPU distributed training in the Trainer with the ccl backend, users should add **`--xpu_backend ccl`** in the command arguments. 
Let's see an example with the [question-answering example](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) From c472b595855a888376653cb7936d03647bd37f6d Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 12 Aug 2022 05:40:53 -0700 Subject: [PATCH 124/162] typos (#18594) --- src/transformers/generation_utils.py | 2 +- src/transformers/models/fsmt/modeling_fsmt.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/generation_utils.py b/src/transformers/generation_utils.py index 8f6dc6a383a774..637b723c88de20 100644 --- a/src/transformers/generation_utils.py +++ b/src/transformers/generation_utils.py @@ -1200,7 +1200,7 @@ def generate( input_ids_seq_length = input_ids.shape[-1] if max_length is None and max_new_tokens is None: warnings.warn( - "Neither `max_length` nor `max_new_tokens` have been set, `max_length` will default to " + "Neither `max_length` nor `max_new_tokens` has been set, `max_length` will default to " f"{self.config.max_length} (`self.config.max_length`). Controlling `max_length` via the config is " "deprecated and `max_length` will be removed from the config in v5 of Transformers -- we recommend " "using `max_new_tokens` to control the maximum length of the generation.", diff --git a/src/transformers/models/fsmt/modeling_fsmt.py b/src/transformers/models/fsmt/modeling_fsmt.py index d44bc80363d09e..f469266d7454f6 100644 --- a/src/transformers/models/fsmt/modeling_fsmt.py +++ b/src/transformers/models/fsmt/modeling_fsmt.py @@ -220,7 +220,7 @@ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. - IIndices can be obtained using [`FSTMTokenizer`]. See [`PreTrainedTokenizer.encode`] and + Indices can be obtained using [`FSTMTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids) From 8cd549f0ff46a3a106f5a52a18e61d8903f7e5d4 Mon Sep 17 00:00:00 2001 From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Date: Fri, 12 Aug 2022 18:18:37 +0530 Subject: [PATCH 125/162] FSDP bug fix for `load_state_dict` (#18596) --- src/transformers/trainer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index e537b3b6357adb..64d5a3fadf4d6d 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -1935,7 +1935,9 @@ def _load_from_checkpoint(self, resume_from_checkpoint, model=None): else: # We load the model state dict on the CPU to avoid an OOM error. state_dict = torch.load(os.path.join(resume_from_checkpoint, WEIGHTS_NAME), map_location="cpu") - load_result = model.load_state_dict(state_dict, strict=False) + # workaround for FSDP bug https://github.com/pytorch/pytorch/issues/82963 + # which takes *args instead of **kwargs + load_result = model.load_state_dict(state_dict, False) # release memory del state_dict self._issue_warnings_after_load(load_result) @@ -1989,7 +1991,9 @@ def _load_best_model(self): # We load the model state dict on the CPU to avoid an OOM error. state_dict = torch.load(best_model_path, map_location="cpu") # If the model is on the GPU, it still works! 
- load_result = model.load_state_dict(state_dict, strict=False) + # workaround for FSDP bug https://github.com/pytorch/pytorch/issues/82963 + # which takes *args instead of **kwargs + load_result = model.load_state_dict(state_dict, False) if not is_sagemaker_mp_enabled(): self._issue_warnings_after_load(load_result) elif os.path.exists(os.path.join(self.state.best_model_checkpoint, WEIGHTS_INDEX_NAME)): From b0dea996490aedd8d74e69c01d1f1480eef2fe07 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Fri, 12 Aug 2022 15:10:00 +0200 Subject: [PATCH 126/162] Add `TFAutoModelForSemanticSegmentation` to the main `__init__.py` (#18600) Co-authored-by: ydshieh --- docs/source/en/model_doc/auto.mdx | 4 ++++ src/transformers/__init__.py | 2 ++ src/transformers/models/auto/__init__.py | 2 ++ src/transformers/utils/dummy_tf_objects.py | 7 +++++++ 4 files changed, 15 insertions(+) diff --git a/docs/source/en/model_doc/auto.mdx b/docs/source/en/model_doc/auto.mdx index 67fc81d280a79b..995296485b9402 100644 --- a/docs/source/en/model_doc/auto.mdx +++ b/docs/source/en/model_doc/auto.mdx @@ -186,6 +186,10 @@ Likewise, if your `NewModel` is a subclass of [`PreTrainedModel`], make sure its [[autodoc]] TFAutoModelForImageClassification +## TFAutoModelForSemanticSegmentation + +[[autodoc]] TFAutoModelForSemanticSegmentation + ## TFAutoModelForMaskedLM [[autodoc]] TFAutoModelForMaskedLM diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 28ec6a17b3a323..7d31bfac67239f 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -2107,6 +2107,7 @@ "TFAutoModelForNextSentencePrediction", "TFAutoModelForPreTraining", "TFAutoModelForQuestionAnswering", + "TFAutoModelForSemanticSegmentation", "TFAutoModelForSeq2SeqLM", "TFAutoModelForSequenceClassification", "TFAutoModelForSpeechSeq2Seq", @@ -4603,6 +4604,7 @@ TFAutoModelForNextSentencePrediction, TFAutoModelForPreTraining, TFAutoModelForQuestionAnswering, + TFAutoModelForSemanticSegmentation, TFAutoModelForSeq2SeqLM, TFAutoModelForSequenceClassification, TFAutoModelForSpeechSeq2Seq, diff --git a/src/transformers/models/auto/__init__.py b/src/transformers/models/auto/__init__.py index 139d4feda336e0..ec253f6037a3d3 100644 --- a/src/transformers/models/auto/__init__.py +++ b/src/transformers/models/auto/__init__.py @@ -128,6 +128,7 @@ "TFAutoModelForNextSentencePrediction", "TFAutoModelForPreTraining", "TFAutoModelForQuestionAnswering", + "TFAutoModelForSemanticSegmentation", "TFAutoModelForSeq2SeqLM", "TFAutoModelForSequenceClassification", "TFAutoModelForSpeechSeq2Seq", @@ -271,6 +272,7 @@ TFAutoModelForNextSentencePrediction, TFAutoModelForPreTraining, TFAutoModelForQuestionAnswering, + TFAutoModelForSemanticSegmentation, TFAutoModelForSeq2SeqLM, TFAutoModelForSequenceClassification, TFAutoModelForSpeechSeq2Seq, diff --git a/src/transformers/utils/dummy_tf_objects.py b/src/transformers/utils/dummy_tf_objects.py index 6df601ca646af3..fec5ffe700808a 100644 --- a/src/transformers/utils/dummy_tf_objects.py +++ b/src/transformers/utils/dummy_tf_objects.py @@ -362,6 +362,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["tf"]) +class TFAutoModelForSemanticSegmentation(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + class TFAutoModelForSeq2SeqLM(metaclass=DummyObject): _backends = ["tf"]
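The new auto class is used like any other TF auto-model. A minimal usage sketch (the SegFormer checkpoint named here is an illustrative assumption, not part of the patch; add `from_pt=True` if a checkpoint only ships PyTorch weights):

```python
from PIL import Image
import requests

from transformers import AutoFeatureExtractor, TFAutoModelForSemanticSegmentation

# illustrative checkpoint; any semantic-segmentation model with a TF port works
checkpoint = "nvidia/segformer-b0-finetuned-ade-512-512"
feature_extractor = AutoFeatureExtractor.from_pretrained(checkpoint)
model = TFAutoModelForSemanticSegmentation.from_pretrained(checkpoint)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = feature_extractor(images=image, return_tensors="tf")
outputs = model(**inputs)
print(outputs.logits.shape)  # for SegFormer: (batch_size, num_labels, height / 4, width / 4)
```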
From b93957a98e944d83f263e21d443a228fef8d52cd Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Fri, 12 Aug 2022 14:53:51 +0100 Subject: [PATCH 127/162] Generate: validate `model_kwargs` (and catch typos in generate arguments) (#18261) * validate generate model_kwargs * generate tests -- not all models have an attn mask --- src/transformers/generation_utils.py | 26 +++++ tests/generation/test_generation_utils.py | 113 +++++++++++++--------- 2 files changed, 91 insertions(+), 48 deletions(-) diff --git a/src/transformers/generation_utils.py b/src/transformers/generation_utils.py index 637b723c88de20..b5b042e718c1c3 100644 --- a/src/transformers/generation_utils.py +++ b/src/transformers/generation_utils.py @@ -841,6 +841,29 @@ def compute_transition_beam_scores( return transition_scores + def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]): + """Validates model kwargs for generation. Generate argument typos will also be caught here.""" + # Excludes arguments that are handled before calling any model function + if self.config.is_encoder_decoder: + for key in ["decoder_input_ids"]: + model_kwargs.pop(key, None) + + unused_model_args = [] + model_args = set(inspect.signature(self.prepare_inputs_for_generation).parameters) + # `kwargs` is often used to handle optional forward pass inputs like `attention_mask`. If + # `prepare_inputs_for_generation` doesn't accept `kwargs`, then a stricter check can be made ;) + if "kwargs" in model_args: + model_args |= set(inspect.signature(self.forward).parameters) + for key, value in model_kwargs.items(): + if value is not None and key not in model_args: + unused_model_args.append(key) + + if unused_model_args: + raise ValueError( + f"The following `model_kwargs` are not used by the model: {unused_model_args} (note: typos in the" + " generate arguments will also show up in this list)" + ) + @torch.no_grad() def generate( self, @@ -1120,6 +1143,9 @@ def generate( >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) ['Paris ist eines der dichtesten besiedelten Gebiete Europas.'] ```""" + # 0. Validate model kwargs + self._validate_model_kwargs(model_kwargs.copy()) + # 1.
Set generation parameters if not already defined bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id num_beams = num_beams if num_beams is not None else self.config.num_beams diff --git a/tests/generation/test_generation_utils.py b/tests/generation/test_generation_utils.py index 56227403ae60b9..ba13669368d228 100644 --- a/tests/generation/test_generation_utils.py +++ b/tests/generation/test_generation_utils.py @@ -75,21 +75,25 @@ class GenerationTesterMixin: def _get_input_ids_and_config(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - input_ids = inputs_dict[self.input_name] - attention_mask = torch.ones_like(input_ids, dtype=torch.long) # cut to half length & take max batch_size 3 max_batch_size = 2 sequence_length = input_ids.shape[-1] // 2 input_ids = input_ids[:max_batch_size, :sequence_length] - attention_mask = attention_mask[:max_batch_size, :sequence_length] # generate max 3 tokens max_length = input_ids.shape[-1] + 3 if config.eos_token_id is not None and config.pad_token_id is None: # hack to allow generate for models such as GPT2 as is done in `generate()` config.pad_token_id = config.eos_token_id + + # TransfoXL has no attention mask + if "transfoxl" in config.__class__.__name__.lower(): + attention_mask = None + else: + attention_mask = torch.ones_like(input_ids, dtype=torch.long)[:max_batch_size, :sequence_length] + return config, input_ids, attention_mask, max_length @staticmethod @@ -252,10 +256,9 @@ def _greedy_generate( ) kwargs = {} - + model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} output_generate = model.generate( input_ids, - attention_mask=attention_mask, do_sample=False, num_beams=1, max_length=max_length, @@ -265,6 +268,7 @@ def _greedy_generate( return_dict_in_generate=return_dict_in_generate, remove_invalid_values=True, **logits_process_kwargs, + **model_kwargs, ) if model.config.is_encoder_decoder: @@ -278,16 +282,17 @@ def _greedy_generate( kwargs["encoder_outputs"] = encoder_outputs with torch.no_grad(): + model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} output_greedy = model.greedy_search( input_ids, max_length=max_length, - attention_mask=attention_mask, logits_processor=logits_processor, output_attentions=output_attentions, output_hidden_states=output_hidden_states, output_scores=output_scores, return_dict_in_generate=return_dict_in_generate, **kwargs, + **model_kwargs, ) return output_greedy, output_generate @@ -308,13 +313,13 @@ def _sample_generate( return_dict_in_generate=False, ): torch.manual_seed(0) + model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} output_generate = model.generate( input_ids, do_sample=True, num_beams=1, max_length=max_length, num_return_sequences=num_return_sequences, - attention_mask=attention_mask, output_scores=output_scores, output_attentions=output_attentions, output_hidden_states=output_hidden_states, @@ -327,7 +332,7 @@ def _sample_generate( torch.manual_seed(0) kwargs = {} if model.config.is_encoder_decoder: - encoder_outputs, input_ids_clone, attention_mask_clone = self._get_encoder_outputs( + encoder_outputs, input_ids, attention_mask = self._get_encoder_outputs( model, input_ids, attention_mask, @@ -336,18 +341,16 @@ def _sample_generate( output_hidden_states=output_hidden_states, ) kwargs["encoder_outputs"] = encoder_outputs - input_ids_clone = input_ids_clone.repeat_interleave(num_return_sequences, dim=0) - else: - 
attention_mask_clone = attention_mask.repeat_interleave(num_return_sequences, dim=0) - input_ids_clone = input_ids.repeat_interleave(num_return_sequences, dim=0) + elif attention_mask is not None: + attention_mask = attention_mask.repeat_interleave(num_return_sequences, dim=0) # prevent flaky generation test failures logits_processor.append(InfNanRemoveLogitsProcessor()) with torch.no_grad(): + model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} output_sample = model.sample( - input_ids_clone, - attention_mask=attention_mask_clone, + input_ids.repeat_interleave(num_return_sequences, dim=0), max_length=max_length, logits_processor=logits_processor, logits_warper=logits_warper, @@ -356,6 +359,7 @@ def _sample_generate( output_hidden_states=output_hidden_states, return_dict_in_generate=return_dict_in_generate, **kwargs, + **model_kwargs, ) return output_sample, output_generate @@ -374,9 +378,9 @@ def _beam_search_generate( output_hidden_states=False, return_dict_in_generate=False, ): + model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} output_generate = model.generate( input_ids, - attention_mask=attention_mask, do_sample=False, max_length=max_length, output_scores=output_scores, @@ -386,12 +390,13 @@ def _beam_search_generate( remove_invalid_values=True, **beam_kwargs, **logits_process_kwargs, + **model_kwargs, ) # beam_search does not automatically interleave `batch_size` dim for `num_beams` kwargs = {} if model.config.is_encoder_decoder: - encoder_outputs, input_ids_clone, attention_mask_clone = self._get_encoder_outputs( + encoder_outputs, input_ids, attention_mask = self._get_encoder_outputs( model, input_ids, attention_mask, @@ -400,23 +405,22 @@ def _beam_search_generate( output_hidden_states=output_hidden_states, ) kwargs["encoder_outputs"] = encoder_outputs - input_ids_clone = input_ids_clone.repeat_interleave(beam_scorer.num_beams, dim=0) - else: - attention_mask_clone = attention_mask.repeat_interleave(beam_scorer.num_beams, dim=0) - input_ids_clone = input_ids.repeat_interleave(beam_scorer.num_beams, dim=0) + elif attention_mask is not None: + attention_mask = attention_mask.repeat_interleave(beam_scorer.num_beams, dim=0) with torch.no_grad(): + model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} output_beam_search = model.beam_search( - input_ids_clone, + input_ids.repeat_interleave(beam_scorer.num_beams, dim=0), beam_scorer, max_length=max_length, - attention_mask=attention_mask_clone, logits_processor=logits_processor, output_scores=output_scores, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict_in_generate=return_dict_in_generate, **kwargs, + **model_kwargs, ) return output_generate, output_beam_search @@ -437,9 +441,9 @@ def _beam_sample_generate( return_dict_in_generate=False, ): torch.manual_seed(0) + model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} output_generate = model.generate( input_ids, - attention_mask=attention_mask, do_sample=True, max_length=max_length, output_scores=output_scores, @@ -449,6 +453,7 @@ def _beam_sample_generate( remove_invalid_values=True, **beam_kwargs, **logits_warper_kwargs, + **model_kwargs, ) # beam_search does not automatically interleave `batch_size` dim for `num_beams * num_return_sequences` kwargs = {} @@ -462,7 +467,7 @@ def _beam_sample_generate( output_hidden_states=output_hidden_states, ) kwargs["encoder_outputs"] = encoder_outputs - else: + elif 
attention_mask is not None: attention_mask = attention_mask.repeat_interleave(beam_scorer.num_beams * num_return_sequences, dim=0) # prevent flaky generation test failures @@ -471,11 +476,11 @@ def _beam_sample_generate( torch.manual_seed(0) with torch.no_grad(): + model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} output_beam_sample = model.beam_sample( input_ids.repeat_interleave(beam_scorer.num_beams * num_return_sequences, dim=0), beam_scorer, max_length=max_length, - attention_mask=attention_mask, logits_warper=logits_warper, logits_processor=logits_processor, output_scores=output_scores, @@ -483,6 +488,7 @@ def _beam_sample_generate( output_hidden_states=output_hidden_states, return_dict_in_generate=return_dict_in_generate, **kwargs, + **model_kwargs, ) return output_generate, output_beam_sample @@ -502,9 +508,9 @@ def _group_beam_search_generate( output_hidden_states=False, return_dict_in_generate=False, ): + model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} output_generate = model.generate( input_ids, - attention_mask=attention_mask, do_sample=False, max_length=max_length, output_scores=output_scores, @@ -514,12 +520,13 @@ def _group_beam_search_generate( remove_invalid_values=True, **beam_kwargs, **logits_process_kwargs, + **model_kwargs, ) # group_beam_search does not automatically interleave `batch_size` dim for `num_beams` kwargs = {} if model.config.is_encoder_decoder: - encoder_outputs, input_ids_clone, attention_mask_clone = self._get_encoder_outputs( + encoder_outputs, input_ids, attention_mask = self._get_encoder_outputs( model, input_ids, attention_mask, @@ -528,23 +535,22 @@ def _group_beam_search_generate( output_hidden_states=output_hidden_states, ) kwargs["encoder_outputs"] = encoder_outputs - input_ids_clone = input_ids_clone.repeat_interleave(beam_scorer.num_beams, dim=0) - else: - attention_mask_clone = attention_mask.repeat_interleave(beam_scorer.num_beams, dim=0) - input_ids_clone = input_ids.repeat_interleave(beam_scorer.num_beams, dim=0) + elif attention_mask is not None: + attention_mask = attention_mask.repeat_interleave(beam_scorer.num_beams, dim=0) with torch.no_grad(): + model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} output_group_beam_search = model.group_beam_search( - input_ids_clone, + input_ids.repeat_interleave(beam_scorer.num_beams, dim=0), beam_scorer, max_length=max_length, - attention_mask=attention_mask_clone, logits_processor=logits_processor, output_scores=output_scores, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict_in_generate=return_dict_in_generate, **kwargs, + **model_kwargs, ) return output_generate, output_group_beam_search @@ -564,9 +570,9 @@ def _constrained_beam_search_generate( output_hidden_states=False, return_dict_in_generate=False, ): + model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} output_generate = model.generate( input_ids, - attention_mask=attention_mask, do_sample=False, max_length=max_length, output_scores=output_scores, @@ -577,12 +583,13 @@ def _constrained_beam_search_generate( constraints=constraints, **beam_kwargs, **logits_process_kwargs, + **model_kwargs, ) # group_beam_search does not automatically interleave `batch_size` dim for `num_beams` kwargs = {} if model.config.is_encoder_decoder: - encoder_outputs, input_ids_clone, attention_mask_clone = self._get_encoder_outputs( + encoder_outputs, input_ids, attention_mask 
= self._get_encoder_outputs( model, input_ids, attention_mask, @@ -591,23 +598,22 @@ def _constrained_beam_search_generate( output_hidden_states=output_hidden_states, ) kwargs["encoder_outputs"] = encoder_outputs - input_ids_clone = input_ids_clone.repeat_interleave(constrained_beam_scorer.num_beams, dim=0) - else: - attention_mask_clone = attention_mask.repeat_interleave(constrained_beam_scorer.num_beams, dim=0) - input_ids_clone = input_ids.repeat_interleave(constrained_beam_scorer.num_beams, dim=0) + elif attention_mask is not None: + attention_mask = attention_mask.repeat_interleave(constrained_beam_scorer.num_beams, dim=0) with torch.no_grad(): + model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} output_group_beam_search = model.constrained_beam_search( - input_ids_clone, + input_ids.repeat_interleave(constrained_beam_scorer.num_beams, dim=0), constrained_beam_scorer, max_length=max_length, - attention_mask=attention_mask_clone, logits_processor=logits_processor, output_scores=output_scores, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict_in_generate=return_dict_in_generate, **kwargs, + **model_kwargs, ) return output_generate, output_group_beam_search @@ -1044,12 +1050,7 @@ def test_generate_without_input_ids(self): model = model_class(config).to(torch_device) model.eval() - output_ids_generate = model.generate( - do_sample=False, - max_length=max_length, - remove_invalid_values=True, - ) - + output_ids_generate = model.generate(do_sample=False, max_length=max_length, remove_invalid_values=True) self.assertIsNotNone(output_ids_generate) def test_group_beam_search_generate(self): @@ -2052,7 +2053,7 @@ def test_max_new_tokens_decoder_only(self): # max_new_tokens and max_length serve the same purpose and must not be used together. 
with self.assertRaises(ValueError): - gpt2_model.generate(decoder_input_ids=input_ids, max_new_tokens=10, max_length=20) + gpt2_model.generate(input_ids=input_ids, max_new_tokens=10, max_length=20) def test_encoder_decoder_generate_with_inputs_embeds(self): article = """Justin Timberlake and Jessica Biel, welcome to parenthood.""" @@ -2699,3 +2700,19 @@ def test_constrained_beam_search_mixin_type_checks(self): with self.assertRaises(ValueError): model.generate(input_ids, force_words_ids=[[[-1]]]) + + def test_validate_generation_inputs(self): + tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/t5-tiny-random") + model = AutoModelForSeq2SeqLM.from_pretrained("patrickvonplaten/t5-tiny-random") + + encoder_input_str = "Hello world" + input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids + + # typos are quickly detected (the correct argument is `do_sample`) + with self.assertRaisesRegex(ValueError, "do_samples"): + model.generate(input_ids, do_samples=True) + + # arbitrary arguments that will not be used anywhere are also not accepted + with self.assertRaisesRegex(ValueError, "foo"): + fake_model_kwargs = {"foo": "bar"} + model.generate(input_ids, **fake_model_kwargs)
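The new check surfaces argument typos immediately instead of silently ignoring them; a minimal sketch mirroring the test above, with the same tiny checkpoint:

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/t5-tiny-random")
model = AutoModelForSeq2SeqLM.from_pretrained("patrickvonplaten/t5-tiny-random")
input_ids = tokenizer("Hello world", return_tensors="pt").input_ids

# `do_samples` is a typo for `do_sample`, so generate() now raises:
# ValueError: The following `model_kwargs` are not used by the model: ['do_samples'] ...
model.generate(input_ids, do_samples=True)
```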
From b881653f74731d7b474e06e1200bc5cc218b52ad Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Fri, 12 Aug 2022 16:15:09 +0200 Subject: [PATCH 128/162] Supporting seq2seq models for `bitsandbytes` integration (#18579) * Supporting seq2seq models for `bitsandbytes` integration - the `bitsandbytes` integration now supports seq2seq models - additionally check whether the model has tied weights * small modification - tie the weights before looking at tied weights! --- src/transformers/utils/bitsandbytes.py | 14 +++++++++++++- tests/mixed_int8/test_mixed_int8.py | 22 ++++++++++++++++++++-- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/src/transformers/utils/bitsandbytes.py b/src/transformers/utils/bitsandbytes.py index ee4e52d421fd09..eca605b2edef94 100644 --- a/src/transformers/utils/bitsandbytes.py +++ b/src/transformers/utils/bitsandbytes.py @@ -1,3 +1,5 @@ +from copy import deepcopy + from transformers.utils import is_accelerate_available, is_bitsandbytes_available @@ -9,6 +11,7 @@ if is_accelerate_available(): from accelerate import init_empty_weights + from accelerate.utils import find_tied_parameters def set_module_8bit_tensor_to_device(module, tensor_name, device, value=None): @@ -132,8 +135,17 @@ def get_key_to_not_convert(model): model (`torch.nn.Module`): Input model """ + # Create a copy of the model and tie the weights, then + # check if it contains tied weights + tied_model = deepcopy(model) # this has 0 cost since it is done inside the `init_empty_weights` context manager + tied_model.tie_weights() + has_tied_params = len(find_tied_parameters(tied_model)) > 0 + + # Check if it is a base model + is_base_model = not hasattr(model, model.base_model_prefix) + + # Ignore this for base models (BertModel, GPT2Model, etc.) - if not hasattr(model, model.base_model_prefix): + if (not has_tied_params) and is_base_model: return "" # otherwise they have an attached head diff --git a/tests/mixed_int8/test_mixed_int8.py b/tests/mixed_int8/test_mixed_int8.py index 0cd7ca16411c19..2911d67748809a 100644 --- a/tests/mixed_int8/test_mixed_int8.py +++ b/tests/mixed_int8/test_mixed_int8.py @@ -15,7 +15,14 @@ import gc import unittest -from transformers import AutoModel, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer, pipeline +from transformers import ( + AutoModel, + AutoModelForCausalLM, + AutoModelForSeq2SeqLM, + AutoModelForSequenceClassification, + AutoTokenizer, + pipeline, +) from transformers.testing_utils import ( is_torch_available, require_accelerate, @@ -106,12 +113,21 @@ def setUp(self): super().setUp() # model_name self.model_name = "bigscience/bloom-560m" - # Models and tokenizer + self.seq_to_seq_name = "t5-small" + + # Different types of model + self.base_model = AutoModel.from_pretrained(self.model_name, load_in_8bit=True, device_map="auto") + # Sequence classification model self.sequence_model = AutoModelForSequenceClassification.from_pretrained( self.model_name, load_in_8bit=True, device_map="auto" ) + # CausalLM model self.model_8bit = AutoModelForCausalLM.from_pretrained(self.model_name, load_in_8bit=True, device_map="auto") + # Seq2seq model + self.seq_to_seq_model = AutoModelForSeq2SeqLM.from_pretrained( + self.seq_to_seq_name, load_in_8bit=True, device_map="auto" + ) def tearDown(self): r""" @@ -121,6 +137,7 @@ def tearDown(self): del self.base_model del self.sequence_model del self.model_8bit + del self.seq_to_seq_model gc.collect() torch.cuda.empty_cache() @@ -138,6 +155,7 @@ def test_correct_head_class(self): # Other heads should be nn.Parameter self.assertTrue(self.model_8bit.lm_head.weight.__class__ == torch.nn.Parameter) self.assertTrue(self.sequence_model.score.weight.__class__ == torch.nn.Parameter) + self.assertTrue(self.seq_to_seq_model.lm_head.weight.__class__ == torch.nn.Parameter) class MixedInt8TestPipeline(BaseMixedInt8Test):
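A minimal sketch of the newly supported seq2seq path, reusing the `t5-small` checkpoint from the test above (assumes a CUDA GPU with `bitsandbytes` and `accelerate` installed):

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")
# the tied lm_head stays in full precision; the linear layers inside the
# encoder/decoder blocks are converted to 8-bit
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small", load_in_8bit=True, device_map="auto")

inputs = tokenizer("translate English to German: Hello, my dog is cute", return_tensors="pt").to(0)
outputs = model.generate(**inputs)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```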
From a9a0e18e881fb3541ffbf01ec35f422413b37228 Mon Sep 17 00:00:00 2001 From: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Date: Fri, 12 Aug 2022 16:40:58 +0200 Subject: [PATCH 129/162] Add Donut (#18488) * First draft * Improve script * Update script * Make conversion work * Add final_layer_norm attribute to Swin's config * Add DonutProcessor * Convert more models * Improve feature extractor and convert base models * Fix bug * Improve integration tests * Improve integration tests and add model to README * Add doc test * Add feature extractor to docs * Fix integration tests * Remove register_buffer * Fix toctree and add missing attribute * Add DonutSwin * Make conversion script work * Improve conversion script * Address comment * Fix bug * Fix another bug * Remove deprecated method from docs * Make Swin and Swinv2 untouched * Fix code examples * Fix processor * Update model_type to donut-swin * Add feature extractor tests, add token2json method, improve feature extractor * Fix failing tests, remove integration test * Add do_thumbnail for consistency * Improve code examples * Add code example for document parsing * Add DonutSwin to MODEL_NAMES_MAPPING * Add model to appropriate place in toctree * Update namespace to appropriate organization Co-authored-by: Niels Rogge --- README.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/_toctree.yml | 2 + docs/source/en/index.mdx | 2 + docs/source/en/model_doc/donut.mdx | 214 ++++ src/transformers/__init__.py | 12 + src/transformers/image_utils.py | 22 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 5 + .../models/auto/feature_extraction_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 1 + .../models/auto/processing_auto.py | 1 + src/transformers/models/donut/__init__.py | 76 ++ .../models/donut/configuration_donut_swin.py | 140 +++ .../models/donut/convert_donut_to_pytorch.py | 234 +++++ .../models/donut/feature_extraction_donut.py | 208 ++++ .../models/donut/modeling_donut_swin.py | 941 ++++++++++++++++++ .../models/donut/processing_donut.py | 156 +++ .../convert_trocr_unilm_to_pytorch.py | 0 src/transformers/utils/dummy_pt_objects.py | 17 + .../utils/dummy_vision_objects.py | 7 + src/transformers/utils/fx.py | 1 + tests/models/donut/__init__.py | 0 .../donut/test_feature_extraction_donut.py | 203 ++++ .../models/donut/test_modeling_donut_swin.py | 464 +++++++ .../test_modeling_vision_encoder_decoder.py | 216 +++- utils/check_copies.py | 1 + utils/check_repo.py | 1 + utils/documentation_tests.txt | 1 + 31 files changed, 2924 insertions(+), 7 deletions(-) create mode 100644 docs/source/en/model_doc/donut.mdx create mode 100644 src/transformers/models/donut/__init__.py create mode 100644 src/transformers/models/donut/configuration_donut_swin.py create mode 100644 src/transformers/models/donut/convert_donut_to_pytorch.py create mode 100644 src/transformers/models/donut/feature_extraction_donut.py create mode 100644 src/transformers/models/donut/modeling_donut_swin.py create mode 100644 src/transformers/models/donut/processing_donut.py rename src/transformers/models/{vision_encoder_decoder => trocr}/convert_trocr_unilm_to_pytorch.py (100%) create mode 100644 tests/models/donut/__init__.py create mode 100644 tests/models/donut/test_feature_extraction_donut.py create mode 100644 tests/models/donut/test_modeling_donut_swin.py diff --git a/README.md b/README.md index 46a4b07c14cd32..30bc6d870bbf01 100644 --- a/README.md +++ b/README.md @@ -286,6 +286,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT. 1.
**[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. +1. **[Donut](https://huggingface.co/docs/transformers/main/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park. 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. diff --git a/README_ko.md b/README_ko.md index c63fdca749da8f..cc0b790ad76a8d 100644 --- a/README_ko.md +++ b/README_ko.md @@ -242,6 +242,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/distillation) and a German version of DistilBERT. 1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. +1. **[Donut](https://huggingface.co/docs/transformers/main/model_doc/donut)** (from NAVER) released with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park. 1. 
**[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. diff --git a/README_zh-hans.md b/README_zh-hans.md index 0ab06bd96ad99f..fe2fa45f71f39f 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -266,6 +266,7 @@ conda install -c huggingface transformers 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (来自 Microsoft Research) 伴随论文 [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) 由 Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan 发布。 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (来自 HuggingFace), 伴随论文 [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) 由 Victor Sanh, Lysandre Debut and Thomas Wolf 发布。 同样的方法也应用于压缩 GPT-2 到 [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/distillation), RoBERTa 到 [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/distillation), Multilingual BERT 到 [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/distillation) 和德语版 DistilBERT。 1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (来自 Microsoft Research) 伴随论文 [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) 由 Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei 发布。 +1. **[Donut](https://huggingface.co/docs/transformers/main/model_doc/donut)** (来自 NAVER) 伴随论文 [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) 由 Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park 发布。 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (来自 Facebook) 伴随论文 [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) 由 Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih 发布。 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (来自 Intel Labs) 伴随论文 [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) 由 René Ranftl, Alexey Bochkovskiy, Vladlen Koltun 发布。 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (来自 Google Research/Stanford University) 伴随论文 [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) 由 Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. 
Manning 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 90f29ad031b8b0..4f5a9954761494 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -278,6 +278,7 @@ conda install -c huggingface transformers 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/distillation) and a German version of DistilBERT. 1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. +1. **[Donut](https://huggingface.co/docs/transformers/main/model_doc/donut)** (from NAVER) released with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park. 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 32ab4c6361d3a7..78137d2c8a74c1 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -427,6 +427,8 @@ title: CLIP - local: model_doc/data2vec title: Data2Vec + - local: model_doc/donut + title: Donut - local: model_doc/flava title: FLAVA - local: model_doc/groupvit diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 5c0d51d8b7afb2..257eba8171ed1c 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -84,6 +84,7 @@ The library currently contains JAX, PyTorch and TensorFlow implementations, pret 1. 
**[DialoGPT](model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. 1. **[DistilBERT](model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT. 1. **[DiT](model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. +1. **[Donut](model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park. 1. **[DPR](model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 1. **[DPT](master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. 1. **[ELECTRA](model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. @@ -224,6 +225,7 @@ Flax), PyTorch, and/or TensorFlow. | DeiT | ❌ | ❌ | ✅ | ✅ | ❌ | | DETR | ❌ | ❌ | ✅ | ❌ | ❌ | | DistilBERT | ✅ | ✅ | ✅ | ✅ | ✅ | +| DonutSwin | ❌ | ❌ | ✅ | ❌ | ❌ | | DPR | ✅ | ✅ | ✅ | ✅ | ❌ | | DPT | ❌ | ❌ | ✅ | ❌ | ❌ | | ELECTRA | ✅ | ✅ | ✅ | ✅ | ✅ | diff --git a/docs/source/en/model_doc/donut.mdx b/docs/source/en/model_doc/donut.mdx new file mode 100644 index 00000000000000..9c9973be022e7c --- /dev/null +++ b/docs/source/en/model_doc/donut.mdx @@ -0,0 +1,214 @@ + + +# Donut + +## Overview + +The Donut model was proposed in [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by +Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park. +Donut consists of an image Transformer encoder and an autoregressive text Transformer decoder to perform document understanding +tasks such as document image classification, form understanding and visual question answering. 
+
+The abstract from the paper is the following:
+
+*Understanding document images (e.g., invoices) is a core but challenging task since it requires complex functions such as reading text and a holistic understanding of the document. Current Visual Document Understanding (VDU) methods outsource the task of reading text to off-the-shelf Optical Character Recognition (OCR) engines and focus on the understanding task with the OCR outputs. Although such OCR-based approaches have shown promising performance, they suffer from 1) high computational costs for using OCR; 2) inflexibility of OCR models on languages or types of document; 3) OCR error propagation to the subsequent process. To address these issues, in this paper, we introduce a novel OCR-free VDU model named Donut, which stands for Document understanding transformer. As the first step in OCR-free VDU research, we propose a simple architecture (i.e., Transformer) with a pre-training objective (i.e., cross-entropy loss). Donut is conceptually simple yet effective. Through extensive experiments and analyses, we show a simple OCR-free VDU model, Donut, achieves state-of-the-art performances on various VDU tasks in terms of both speed and accuracy. In addition, we offer a synthetic data generator that helps the model pre-training to be flexible in various languages and domains.*
+
+ Donut high-level overview. Taken from the original paper.
+
+This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found
+[here](https://github.com/clovaai/donut).
+
+Tips:
+
+- The quickest way to get started with Donut is by checking the [tutorial
+  notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/donut), which show how to use the model
+  at inference time as well as fine-tuning on custom data.
+- Donut is always used within the [VisionEncoderDecoder](vision-encoder-decoder) framework.
+
+## Inference
+
+Donut's [`VisionEncoderDecoder`] model accepts images as input and makes use of
+[`~generation_utils.GenerationMixin.generate`] to autoregressively generate text given the input image.
+
+The [`DonutFeatureExtractor`] class is responsible for preprocessing the input image and
+[`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`] decodes the generated target tokens to the target string. The
+[`DonutProcessor`] wraps [`DonutFeatureExtractor`] and [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]
+into a single instance to both extract the input features and decode the predicted token ids.
+
+- Step-by-step Document Image Classification
+
+```py
+>>> import re
+
+>>> from transformers import DonutProcessor, VisionEncoderDecoderModel
+>>> from datasets import load_dataset
+>>> import torch
+
+>>> processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-rvlcdip")
+>>> model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-rvlcdip")
+
+>>> device = "cuda" if torch.cuda.is_available() else "cpu"
+>>> model.to(device)  # doctest: +IGNORE_RESULT
+
+>>> # load document image
+>>> dataset = load_dataset("hf-internal-testing/example-documents", split="test")
+>>> image = dataset[1]["image"]
+
+>>> # prepare decoder inputs
+>>> task_prompt = "<s_rvlcdip>"
+>>> decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
+
+>>> pixel_values = processor(image, return_tensors="pt").pixel_values
+
+>>> outputs = model.generate(
+...     pixel_values.to(device),
+...     decoder_input_ids=decoder_input_ids.to(device),
+...     max_length=model.decoder.config.max_position_embeddings,
+...     early_stopping=True,
+...     pad_token_id=processor.tokenizer.pad_token_id,
+...     eos_token_id=processor.tokenizer.eos_token_id,
+...     use_cache=True,
+...     num_beams=1,
+...     bad_words_ids=[[processor.tokenizer.unk_token_id]],
+...     return_dict_in_generate=True,
+... )
+
+>>> sequence = processor.batch_decode(outputs.sequences)[0]
+>>> sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
+>>> sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token
+>>> print(processor.token2json(sequence))
+{'class': 'advertisement'}
+```
+
+- Step-by-step Document Parsing
+
+```py
+>>> import re
+
+>>> from transformers import DonutProcessor, VisionEncoderDecoderModel
+>>> from datasets import load_dataset
+>>> import torch
+
+>>> processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")
+>>> model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")
+
+>>> device = "cuda" if torch.cuda.is_available() else "cpu"
+>>> model.to(device)  # doctest: +IGNORE_RESULT
+
+>>> # load document image
+>>> dataset = load_dataset("hf-internal-testing/example-documents", split="test")
+>>> image = dataset[2]["image"]
+
+>>> # prepare decoder inputs
+>>> task_prompt = "<s_cord-v2>"
+>>> decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
+
+>>> pixel_values = processor(image, return_tensors="pt").pixel_values
+
+>>> outputs = model.generate(
+...     pixel_values.to(device),
+...     decoder_input_ids=decoder_input_ids.to(device),
+...     max_length=model.decoder.config.max_position_embeddings,
+...     early_stopping=True,
+...     pad_token_id=processor.tokenizer.pad_token_id,
+...     eos_token_id=processor.tokenizer.eos_token_id,
+...     use_cache=True,
+...     num_beams=1,
+...     bad_words_ids=[[processor.tokenizer.unk_token_id]],
+...     return_dict_in_generate=True,
+... )
+
+>>> sequence = processor.batch_decode(outputs.sequences)[0]
+>>> sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
+>>> sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token
+>>> print(processor.token2json(sequence))
+{'menu': {'nm': 'CINNAMON SUGAR', 'unitprice': '17,000', 'cnt': '1 x', 'price': '17,000'}, 'sub_total': {'subtotal_price': '17,000'}, 'total': {'total_price': '17,000', 'cashprice': '20,000', 'changeprice': '3,000'}}
+```
+
+- Step-by-step Document Visual Question Answering (DocVQA)
+
+```py
+>>> import re
+
+>>> from transformers import DonutProcessor, VisionEncoderDecoderModel
+>>> from datasets import load_dataset
+>>> import torch
+
+>>> processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
+>>> model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
+
+>>> device = "cuda" if torch.cuda.is_available() else "cpu"
+>>> model.to(device)  # doctest: +IGNORE_RESULT
+
+>>> # load document image from the DocVQA dataset
+>>> dataset = load_dataset("hf-internal-testing/example-documents", split="test")
+>>> image = dataset[0]["image"]
+
+>>> # prepare decoder inputs
+>>> task_prompt = "<s_docvqa><s_question>{user_input}</s_question><s_answer>"
+>>> question = "When is the coffee break?"
+>>> prompt = task_prompt.replace("{user_input}", question) +>>> decoder_input_ids = processor.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids + +>>> pixel_values = processor(image, return_tensors="pt").pixel_values + +>>> outputs = model.generate( +... pixel_values.to(device), +... decoder_input_ids=decoder_input_ids.to(device), +... max_length=model.decoder.config.max_position_embeddings, +... early_stopping=True, +... pad_token_id=processor.tokenizer.pad_token_id, +... eos_token_id=processor.tokenizer.eos_token_id, +... use_cache=True, +... num_beams=1, +... bad_words_ids=[[processor.tokenizer.unk_token_id]], +... return_dict_in_generate=True, +... ) + +>>> sequence = processor.batch_decode(outputs.sequences)[0] +>>> sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "") +>>> sequence = re.sub(r"<.*?>", "", sequence, count=1).strip() # remove first task start token +>>> print(processor.token2json(sequence)) +{'question': 'When is the coffee break?', 'answer': '11-14 to 11:39 a.m.'} +``` + +See the [model hub](https://huggingface.co/models?filter=donut) to look for Donut checkpoints. + +## Training + +We refer to the [tutorial notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/donut). + +## DonutSwinConfig + +[[autodoc]] DonutSwinConfig + +## DonutFeatureExtractor + +[[autodoc]] DonutFeatureExtractor + - __call__ + +## DonutProcessor + +[[autodoc]] DonutProcessor + - __call__ + - from_pretrained + - save_pretrained + - batch_decode + - decode + +## DonutSwinModel + +[[autodoc]] DonutSwinModel + - forward \ No newline at end of file diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 7d31bfac67239f..41afbca1661129 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -190,6 +190,7 @@ "models.dialogpt": [], "models.distilbert": ["DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DistilBertConfig", "DistilBertTokenizer"], "models.dit": [], + "models.donut": ["DONUT_SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP", "DonutProcessor", "DonutSwinConfig"], "models.dpr": [ "DPR_PRETRAINED_CONFIG_ARCHIVE_MAP", "DPRConfig", @@ -643,6 +644,7 @@ _import_structure["models.convnext"].append("ConvNextFeatureExtractor") _import_structure["models.deit"].append("DeiTFeatureExtractor") _import_structure["models.detr"].append("DetrFeatureExtractor") + _import_structure["models.donut"].append("DonutFeatureExtractor") _import_structure["models.dpt"].append("DPTFeatureExtractor") _import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaProcessor"]) _import_structure["models.glpn"].append("GLPNFeatureExtractor") @@ -1101,6 +1103,13 @@ "DistilBertPreTrainedModel", ] ) + _import_structure["models.donut"].extend( + [ + "DONUT_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST", + "DonutSwinModel", + "DonutSwinPreTrainedModel", + ] + ) _import_structure["models.dpr"].extend( [ "DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -2986,6 +2995,7 @@ from .models.deit import DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP, DeiTConfig from .models.detr import DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, DetrConfig from .models.distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DistilBertConfig, DistilBertTokenizer + from .models.donut import DONUT_SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP, DonutProcessor, DonutSwinConfig from .models.dpr import ( DPR_PRETRAINED_CONFIG_ARCHIVE_MAP, DPRConfig, @@ -3379,6 +3389,7 @@ from .models.convnext import ConvNextFeatureExtractor from .models.deit import 
DeiTFeatureExtractor from .models.detr import DetrFeatureExtractor + from .models.donut import DonutFeatureExtractor from .models.dpt import DPTFeatureExtractor from .models.flava import FlavaFeatureExtractor, FlavaProcessor from .models.glpn import GLPNFeatureExtractor @@ -3765,6 +3776,7 @@ DistilBertModel, DistilBertPreTrainedModel, ) + from .models.donut import DONUT_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST, DonutSwinModel, DonutSwinPreTrainedModel from .models.dpr import ( DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST, DPR_QUESTION_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index c84f6638f6e8b1..4d291c7d489508 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -466,3 +466,25 @@ def flip_channel_order(self, image): image = self.to_numpy_array(image) return image[::-1, :, :] + + def rotate(self, image, angle, resample=PIL.Image.NEAREST, expand=0, center=None, translate=None, fillcolor=None): + """ + Returns a rotated copy of `image`. This method returns a copy of `image`, rotated the given number of degrees + counter clockwise around its centre. + + Args: + image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`): + The image to rotate. If `np.ndarray` or `torch.Tensor`, will be converted to `PIL.Image.Image` before + rotating. + + Returns: + image: A rotated `PIL.Image.Image`. + """ + self._ensure_format_supported(image) + + if not isinstance(image, PIL.Image.Image): + image = self.to_pil_image(image) + + return image.rotate( + angle, resample=resample, expand=expand, center=center, translate=translate, fillcolor=fillcolor + ) diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 11887db91f8393..fdf315b2257d8b 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -52,6 +52,7 @@ dialogpt, distilbert, dit, + donut, dpr, dpt, electra, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index c65a2762a00029..c9e6156a3843d3 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -56,6 +56,7 @@ ("deit", "DeiTConfig"), ("detr", "DetrConfig"), ("distilbert", "DistilBertConfig"), + ("donut-swin", "DonutSwinConfig"), ("dpr", "DPRConfig"), ("dpt", "DPTConfig"), ("electra", "ElectraConfig"), @@ -181,6 +182,7 @@ ("deit", "DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("detr", "DETR_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("distilbert", "DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("donut-swin", "DONUT_SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("dpr", "DPR_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("dpt", "DPT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("electra", "ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -304,6 +306,8 @@ ("dialogpt", "DialoGPT"), ("distilbert", "DistilBERT"), ("dit", "DiT"), + ("donut", "Donut"), + ("donut-swin", "DonutSwin"), ("dpr", "DPR"), ("dpt", "DPT"), ("electra", "ELECTRA"), @@ -420,6 +424,7 @@ ("data2vec-audio", "data2vec"), ("data2vec-text", "data2vec"), ("data2vec-vision", "data2vec"), + ("donut-swin", "donut"), ] ) diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index db581d03d8fb7e..5c5f86d040c8fe 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -46,6 +46,7 @@ ("deit", "DeiTFeatureExtractor"), ("detr", 
"DetrFeatureExtractor"), ("detr", "DetrFeatureExtractor"), + ("donut", "DonutFeatureExtractor"), ("dpt", "DPTFeatureExtractor"), ("flava", "FlavaFeatureExtractor"), ("glpn", "GLPNFeatureExtractor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index bd4774c245b07b..0e026cb48d0c0d 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -56,6 +56,7 @@ ("deit", "DeiTModel"), ("detr", "DetrModel"), ("distilbert", "DistilBertModel"), + ("donut-swin", "DonutSwinModel"), ("dpr", "DPRQuestionEncoder"), ("dpt", "DPTModel"), ("electra", "ElectraModel"), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index aed7b4b9761373..c6f4fd98316a44 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -38,6 +38,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict( [ ("clip", "CLIPProcessor"), + ("donut", "DonutProcessor"), ("flava", "FlavaProcessor"), ("groupvit", "CLIPProcessor"), ("layoutlmv2", "LayoutLMv2Processor"), diff --git a/src/transformers/models/donut/__init__.py b/src/transformers/models/donut/__init__.py new file mode 100644 index 00000000000000..a01f6b11a9a995 --- /dev/null +++ b/src/transformers/models/donut/__init__.py @@ -0,0 +1,76 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING + +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available + + +_import_structure = { + "configuration_donut_swin": ["DONUT_SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP", "DonutSwinConfig"], + "processing_donut": ["DonutProcessor"], +} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_donut_swin"] = [ + "DONUT_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST", + "DonutSwinModel", + "DonutSwinPreTrainedModel", + ] + +try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["feature_extraction_donut"] = ["DonutFeatureExtractor"] + + +if TYPE_CHECKING: + from .configuration_donut_swin import DONUT_SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP, DonutSwinConfig + from .processing_donut import DonutProcessor + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_donut_swin import ( + DONUT_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST, + DonutSwinModel, + DonutSwinPreTrainedModel, + ) + + try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .feature_extraction_donut import DonutFeatureExtractor + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/donut/configuration_donut_swin.py b/src/transformers/models/donut/configuration_donut_swin.py new file mode 100644 index 00000000000000..d3316bdc79f685 --- /dev/null +++ b/src/transformers/models/donut/configuration_donut_swin.py @@ -0,0 +1,140 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Donut Swin Transformer model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +DONUT_SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "naver-clova-ix/donut-base": "https://huggingface.co/naver-clova-ix/donut-base/resolve/main/config.json", + # See all Donut models at https://huggingface.co/models?filter=donut-swin +} + + +class DonutSwinConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DonutSwinModel`]. It is used to instantiate a + Donut model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the Donut + [naver-clova-ix/donut-base](https://huggingface.co/naver-clova-ix/donut-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. 
+
+    Args:
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 4):
+            The size (resolution) of each patch.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        embed_dim (`int`, *optional*, defaults to 96):
+            Dimensionality of patch embedding.
+        depths (`list(int)`, *optional*, defaults to [2, 2, 6, 2]):
+            Depth of each layer in the Transformer encoder.
+        num_heads (`list(int)`, *optional*, defaults to [3, 6, 12, 24]):
+            Number of attention heads in each layer of the Transformer encoder.
+        window_size (`int`, *optional*, defaults to 7):
+            Size of windows.
+        mlp_ratio (`float`, *optional*, defaults to 4.0):
+            Ratio of MLP hidden dimensionality to embedding dimensionality.
+        qkv_bias (`bool`, *optional*, defaults to True):
+            Whether or not a learnable bias should be added to the queries, keys and values.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings and encoder.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        drop_path_rate (`float`, *optional*, defaults to 0.1):
+            Stochastic depth rate.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`,
+            `"selu"` and `"gelu_new"` are supported.
+        use_absolute_embeddings (`bool`, *optional*, defaults to False):
+            Whether or not to add absolute position embeddings to the patch embeddings.
+        patch_norm (`bool`, *optional*, defaults to True):
+            Whether or not to add layer normalization after patch embedding.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
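+
+    Note:
+        The configuration also derives a `hidden_size` attribute, set to `int(embed_dim * 2 ** (len(depths) - 1))`
+        (the channel dimension after the last stage), so that the encoder can be plugged into
+        [`VisionEncoderDecoderModel`]. With the defaults above this gives `int(96 * 2**3) = 768`.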
+
+    Example:
+
+    ```python
+    >>> from transformers import DonutSwinConfig, DonutSwinModel
+
+    >>> # Initializing a Donut naver-clova-ix/donut-base style configuration
+    >>> configuration = DonutSwinConfig()
+
+    >>> # Randomly initializing a model from the naver-clova-ix/donut-base style configuration
+    >>> model = DonutSwinModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "donut-swin"
+
+    attribute_map = {
+        "num_attention_heads": "num_heads",
+        "num_hidden_layers": "num_layers",
+    }
+
+    def __init__(
+        self,
+        image_size=224,
+        patch_size=4,
+        num_channels=3,
+        embed_dim=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=7,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        hidden_dropout_prob=0.0,
+        attention_probs_dropout_prob=0.0,
+        drop_path_rate=0.1,
+        hidden_act="gelu",
+        use_absolute_embeddings=False,
+        patch_norm=True,
+        initializer_range=0.02,
+        layer_norm_eps=1e-5,
+        **kwargs
+    ):
+        super().__init__(**kwargs)
+
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.embed_dim = embed_dim
+        self.depths = depths
+        self.num_layers = len(depths)
+        self.num_heads = num_heads
+        self.window_size = window_size
+        self.mlp_ratio = mlp_ratio
+        self.qkv_bias = qkv_bias
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.drop_path_rate = drop_path_rate
+        self.hidden_act = hidden_act
+        self.use_absolute_embeddings = use_absolute_embeddings
+        self.patch_norm = patch_norm
+        self.layer_norm_eps = layer_norm_eps
+        self.initializer_range = initializer_range
+        # we set the hidden_size attribute in order to make Swin work with VisionEncoderDecoderModel
+        # this indicates the channel dimension after the last stage of the model
+        self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
diff --git a/src/transformers/models/donut/convert_donut_to_pytorch.py b/src/transformers/models/donut/convert_donut_to_pytorch.py
new file mode 100644
index 00000000000000..507f10cb776cf0
--- /dev/null
+++ b/src/transformers/models/donut/convert_donut_to_pytorch.py
@@ -0,0 +1,234 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert Donut checkpoints using the original `donut-python` library.
URL: https://github.com/clovaai/donut""" + +import argparse + +import torch +from datasets import load_dataset + +from donut import DonutModel +from transformers import ( + DonutFeatureExtractor, + DonutProcessor, + DonutSwinConfig, + DonutSwinModel, + MBartConfig, + MBartForCausalLM, + VisionEncoderDecoderModel, + XLMRobertaTokenizerFast, +) + + +def get_configs(model): + original_config = model.config + + encoder_config = DonutSwinConfig( + image_size=original_config.input_size, + patch_size=4, + depths=original_config.encoder_layer, + num_heads=[4, 8, 16, 32], + window_size=original_config.window_size, + embed_dim=128, + ) + decoder_config = MBartConfig( + is_decoder=True, + is_encoder_decoder=False, + add_cross_attention=True, + decoder_layers=original_config.decoder_layer, + max_position_embeddings=original_config.max_position_embeddings, + vocab_size=len( + model.decoder.tokenizer + ), # several special tokens are added to the vocab of XLMRobertaTokenizer, see repo on the hub (added_tokens.json) + scale_embedding=True, + add_final_layer_norm=True, + ) + + return encoder_config, decoder_config + + +def rename_key(name): + if "encoder.model" in name: + name = name.replace("encoder.model", "encoder") + if "decoder.model" in name: + name = name.replace("decoder.model", "decoder") + if "patch_embed.proj" in name: + name = name.replace("patch_embed.proj", "embeddings.patch_embeddings.projection") + if "patch_embed.norm" in name: + name = name.replace("patch_embed.norm", "embeddings.norm") + if name.startswith("encoder"): + if "layers" in name: + name = "encoder." + name + if "attn.proj" in name: + name = name.replace("attn.proj", "attention.output.dense") + if "attn" in name and "mask" not in name: + name = name.replace("attn", "attention.self") + if "norm1" in name: + name = name.replace("norm1", "layernorm_before") + if "norm2" in name: + name = name.replace("norm2", "layernorm_after") + if "mlp.fc1" in name: + name = name.replace("mlp.fc1", "intermediate.dense") + if "mlp.fc2" in name: + name = name.replace("mlp.fc2", "output.dense") + + if name == "encoder.norm.weight": + name = "encoder.layernorm.weight" + if name == "encoder.norm.bias": + name = "encoder.layernorm.bias" + + return name + + +def convert_state_dict(orig_state_dict, model): + for key in orig_state_dict.copy().keys(): + val = orig_state_dict.pop(key) + + if "qkv" in key: + key_split = key.split(".") + layer_num = int(key_split[3]) + block_num = int(key_split[5]) + dim = model.encoder.encoder.layers[layer_num].blocks[block_num].attention.self.all_head_size + + if "weight" in key: + orig_state_dict[ + f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.weight" + ] = val[:dim, :] + orig_state_dict[ + f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.weight" + ] = val[dim : dim * 2, :] + orig_state_dict[ + f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.weight" + ] = val[-dim:, :] + else: + orig_state_dict[ + f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.bias" + ] = val[:dim] + orig_state_dict[ + f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.bias" + ] = val[dim : dim * 2] + orig_state_dict[ + f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.bias" + ] = val[-dim:] + elif "attn_mask" in key or key in ["encoder.model.norm.weight", "encoder.model.norm.bias"]: + # HuggingFace implementation doesn't use attn_mask buffer + # and model doesn't use final LayerNorms for 
the encoder
+            pass
+        else:
+            orig_state_dict[rename_key(key)] = val
+
+    return orig_state_dict
+
+
+def convert_donut_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False):
+    # load original model
+    original_model = DonutModel.from_pretrained(model_name).eval()
+
+    # load HuggingFace model
+    encoder_config, decoder_config = get_configs(original_model)
+    encoder = DonutSwinModel(encoder_config)
+    decoder = MBartForCausalLM(decoder_config)
+    model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder)
+    model.eval()
+
+    state_dict = original_model.state_dict()
+    new_state_dict = convert_state_dict(state_dict, model)
+    model.load_state_dict(new_state_dict)
+
+    # verify results on scanned document
+    dataset = load_dataset("hf-internal-testing/example-documents")
+    image = dataset["test"][0]["image"].convert("RGB")
+
+    tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name, from_slow=True)
+    feature_extractor = DonutFeatureExtractor(
+        do_align_long_axis=original_model.config.align_long_axis, size=original_model.config.input_size[::-1]
+    )
+    processor = DonutProcessor(feature_extractor, tokenizer)
+    pixel_values = processor(image, return_tensors="pt").pixel_values
+
+    # each checkpoint expects its task-specific start prompt
+    if model_name == "naver-clova-ix/donut-base-finetuned-docvqa":
+        task_prompt = "<s_docvqa><s_question>{user_input}</s_question><s_answer>"
+        question = "When is the coffee break?"
+        task_prompt = task_prompt.replace("{user_input}", question)
+    elif model_name == "naver-clova-ix/donut-base-finetuned-rvlcdip":
+        task_prompt = "<s_rvlcdip>"
+    elif model_name in [
+        "naver-clova-ix/donut-base-finetuned-cord-v1",
+        "naver-clova-ix/donut-base-finetuned-cord-v1-2560",
+    ]:
+        task_prompt = "<s_cord>"
+    elif model_name == "naver-clova-ix/donut-base-finetuned-cord-v2":
+        task_prompt = "<s_cord-v2>"
+    elif model_name == "naver-clova-ix/donut-base-finetuned-zhtrainticket":
+        task_prompt = "<s_zhtrainticket>"
+    elif model_name in ["naver-clova-ix/donut-proto", "naver-clova-ix/donut-base"]:
+        # use a random prompt
+        task_prompt = "hello world"
+    else:
+        raise ValueError("Model name not supported")
+    prompt_tensors = original_model.decoder.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt")[
+        "input_ids"
+    ]
+
+    original_patch_embed = original_model.encoder.model.patch_embed(pixel_values)
+    patch_embeddings, _ = model.encoder.embeddings(pixel_values)
+    assert torch.allclose(original_patch_embed, patch_embeddings, atol=1e-3)
+
+    # verify encoder hidden states
+    original_last_hidden_state = original_model.encoder(pixel_values)
+    last_hidden_state = model.encoder(pixel_values).last_hidden_state
+    assert torch.allclose(original_last_hidden_state, last_hidden_state, atol=1e-2)
+
+    # verify decoder hidden states
+    original_logits = original_model(pixel_values, prompt_tensors, None).logits
+    logits = model(pixel_values, decoder_input_ids=prompt_tensors).logits
+    assert torch.allclose(original_logits, logits, atol=1e-3)
+    print("Looks ok!")
+
+    if pytorch_dump_folder_path is not None:
+        print(f"Saving model and processor to {pytorch_dump_folder_path}")
+        model.save_pretrained(pytorch_dump_folder_path)
+        processor.save_pretrained(pytorch_dump_folder_path)
+
+    if push_to_hub:
+        model.push_to_hub("nielsr/" + model_name.split("/")[-1], commit_message="Update model")
+        processor.push_to_hub("nielsr/" + model_name.split("/")[-1], commit_message="Update model")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    # Required parameters
+    parser.add_argument(
+        "--model_name",
+        default="naver-clova-ix/donut-base-finetuned-docvqa",
+        required=False,
+        type=str,
help="Name of the original model you'd like to convert.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", + default=None, + required=False, + type=str, + help="Path to the output PyTorch model directory.", + ) + parser.add_argument( + "--push_to_hub", + action="store_true", + help="Whether or not to push the converted model and processor to the 🤗 hub.", + ) + + args = parser.parse_args() + convert_donut_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/donut/feature_extraction_donut.py b/src/transformers/models/donut/feature_extraction_donut.py new file mode 100644 index 00000000000000..09bf3a6ad1c157 --- /dev/null +++ b/src/transformers/models/donut/feature_extraction_donut.py @@ -0,0 +1,208 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for Donut.""" + +from typing import Optional, Tuple, Union + +import numpy as np +from PIL import Image, ImageOps + +from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ImageFeatureExtractionMixin, + ImageInput, + is_torch_tensor, +) +from ...utils import TensorType, logging + + +logger = logging.get_logger(__name__) + + +class DonutFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): + r""" + Constructs a Donut feature extractor. + + This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users + should refer to this superclass for more information regarding those methods. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the shorter edge of the input to the minimum value of a certain `size`. + size (`Tuple(int)`, *optional*, defaults to [1920, 2560]): + Resize the shorter edge of the input to the minimum value of the given size. Should be a tuple of (width, + height). Only has an effect if `do_resize` is set to `True`. + resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`): + An optional resampling filter. This can be one of `PIL.Image.NEAREST`, `PIL.Image.BOX`, + `PIL.Image.BILINEAR`, `PIL.Image.HAMMING`, `PIL.Image.BICUBIC` or `PIL.Image.LANCZOS`. Only has an effect + if `do_resize` is set to `True`. + do_thumbnail (`bool`, *optional*, defaults to `True`): + Whether to thumbnail the input to the given `size`. + do_align_long_axis (`bool`, *optional*, defaults to `False`): + Whether to rotate the input if the height is greater than width. + do_pad (`bool`, *optional*, defaults to `True`): + Whether or not to pad the input to `size`. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether or not to normalize the input with mean and standard deviation. + image_mean (`List[int]`, defaults to `[0.5, 0.5, 0.5]`): + The sequence of means for each channel, to be used when normalizing images. 
+        image_std (`List[float]`, defaults to `[0.5, 0.5, 0.5]`):
+            The sequence of standard deviations for each channel, to be used when normalizing images.
+
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_resize=True,
+        size=[1920, 2560],
+        resample=Image.BILINEAR,
+        do_thumbnail=True,
+        do_align_long_axis=False,
+        do_pad=True,
+        do_normalize=True,
+        image_mean=None,
+        image_std=None,
+        **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.do_resize = do_resize
+        self.size = size
+        self.resample = resample
+        self.do_thumbnail = do_thumbnail
+        self.do_align_long_axis = do_align_long_axis
+        self.do_pad = do_pad
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
+        self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
+
+    def rotate_image(self, image, size):
+        if not isinstance(image, Image.Image):
+            image = self.to_pil_image(image)
+
+        # rotate by -90 degrees when the orientation of the image and the target size disagree
+        if (size[1] > size[0] and image.width > image.height) or (size[1] < size[0] and image.width < image.height):
+            image = self.rotate(image, angle=-90, expand=True)
+
+        return image
+
+    def thumbnail(self, image, size):
+        if not isinstance(image, Image.Image):
+            image = self.to_pil_image(image)
+
+        # shrinks the image in place so that it fits within (width, height), keeping the aspect ratio
+        image.thumbnail((size[0], size[1]))
+
+        return image
+
+    def pad(self, image: Image.Image, size: Tuple[int, int], random_padding: bool = False) -> Image.Image:
+        delta_width = size[0] - image.width
+        delta_height = size[1] - image.height
+
+        if random_padding:
+            pad_width = np.random.randint(low=0, high=delta_width + 1)
+            pad_height = np.random.randint(low=0, high=delta_height + 1)
+        else:
+            pad_width = delta_width // 2
+            pad_height = delta_height // 2
+
+        padding = (pad_width, pad_height, delta_width - pad_width, delta_height - pad_height)
+        return ImageOps.expand(image, padding)
+
+    def __call__(
+        self,
+        images: ImageInput,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        random_padding=False,
+        **kwargs
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several image(s).
+
+        <Tip>
+
+        NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
+        PIL images.
+
+        </Tip>
+
+        Args:
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
+                number of channels, H and W are image height and width.
+
+            random_padding (`bool`, *optional*, defaults to `False`):
+                Whether to randomly pad the input to `size`.
+
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+            - **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height,
+              width).
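+
+        Example:
+
+        A minimal sketch (assuming Pillow is installed; the blank image below is a stand-in for a real document
+        scan), using the default `size` of `[1920, 2560]`:
+
+        ```python
+        >>> from PIL import Image
+        >>> from transformers import DonutFeatureExtractor
+
+        >>> feature_extractor = DonutFeatureExtractor()
+        >>> image = Image.new("RGB", (640, 480))
+        >>> inputs = feature_extractor(image, return_tensors="pt")
+        >>> list(inputs.pixel_values.shape)  # resized, thumbnailed and padded to width 1920, height 2560
+        [1, 3, 2560, 1920]
+        ```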
+ """ + # Input type checking for clearer error + valid_images = False + + # Check that images has a valid type + if isinstance(images, (Image.Image, np.ndarray)) or is_torch_tensor(images): + valid_images = True + elif isinstance(images, (list, tuple)): + if len(images) == 0 or isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]): + valid_images = True + + if not valid_images: + raise ValueError( + "Images must of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single example), " + "`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of examples)." + ) + + is_batched = bool( + isinstance(images, (list, tuple)) + and (isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0])) + ) + + if not is_batched: + images = [images] + + # transformations (rotating + resizing + thumbnailing + padding + normalization) + if self.do_align_long_axis: + images = [self.rotate_image(image, self.size) for image in images] + if self.do_resize and self.size is not None: + images = [ + self.resize(image=image, size=min(self.size), resample=self.resample, default_to_square=False) + for image in images + ] + if self.do_thumbnail and self.size is not None: + images = [self.thumbnail(image=image, size=self.size) for image in images] + if self.do_pad and self.size is not None: + images = [self.pad(image=image, size=self.size, random_padding=random_padding) for image in images] + if self.do_normalize: + images = [self.normalize(image=image, mean=self.image_mean, std=self.image_std) for image in images] + + # return as BatchFeature + data = {"pixel_values": images} + encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) + + return encoded_inputs diff --git a/src/transformers/models/donut/modeling_donut_swin.py b/src/transformers/models/donut/modeling_donut_swin.py new file mode 100644 index 00000000000000..78e5cc81c19885 --- /dev/null +++ b/src/transformers/models/donut/modeling_donut_swin.py @@ -0,0 +1,941 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Donut Swin Transformer model. 
+ +This implementation is identical to a regular Swin Transformer, without final layer norm on top of the final hidden +states.""" + +import collections.abc +import math +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn + +from ...activations import ACT2FN +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, +) +from .configuration_donut_swin import DonutSwinConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "DonutSwinConfig" +_FEAT_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor" + +# Base docstring +_CHECKPOINT_FOR_DOC = "https://huggingface.co/naver-clova-ix/donut-base" +_EXPECTED_OUTPUT_SHAPE = [1, 49, 768] + +DONUT_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "naver-clova-ix/donut-base", + # See all Donut Swin models at https://huggingface.co/models?filter=donut +] + + +@dataclass +# Copied from transformers.models.swin.modeling_swin.SwinEncoderOutput with Swin->DonutSwin +class DonutSwinEncoderOutput(ModelOutput): + """ + DonutSwin encoder's outputs, with potential hidden states and attentions. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, hidden_size, height, width)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to + include the spatial dimensions. + """ + + last_hidden_state: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +# Copied from transformers.models.swin.modeling_swin.SwinModelOutput with Swin->DonutSwin +class DonutSwinModelOutput(ModelOutput): + """ + DonutSwin model's outputs that also contains a pooling of the last hidden states. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. 
+        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
+            Average pooling of the last layer hidden-state.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+            shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+            shape `(batch_size, hidden_size, height, width)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
+            include the spatial dimensions.
+    """
+
+    last_hidden_state: torch.FloatTensor = None
+    pooler_output: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+    reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+
+
+# Copied from transformers.models.swin.modeling_swin.window_partition
+def window_partition(input_feature, window_size):
+    """
+    Partitions the given input into windows.
+    """
+    batch_size, height, width, num_channels = input_feature.shape
+    input_feature = input_feature.view(
+        batch_size, height // window_size, window_size, width // window_size, window_size, num_channels
+    )
+    windows = input_feature.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, num_channels)
+    return windows
+
+
+# Copied from transformers.models.swin.modeling_swin.window_reverse
+def window_reverse(windows, window_size, height, width):
+    """
+    Merges windows to produce higher resolution features.
+    """
+    batch_size = math.floor(windows.shape[0] / (height * width / window_size / window_size))
+    windows = windows.view(batch_size, height // window_size, width // window_size, window_size, window_size, -1)
+    windows = windows.permute(0, 1, 3, 2, 4, 5).contiguous().view(batch_size, height, width, -1)
+    return windows
+
+
+# Copied from transformers.models.swin.modeling_swin.SwinEmbeddings with Swin->DonutSwin
+class DonutSwinEmbeddings(nn.Module):
+    """
+    Construct the patch and position embeddings. Optionally, also the mask token.
+ """ + + def __init__(self, config, use_mask_token=False): + super().__init__() + + self.patch_embeddings = DonutSwinPatchEmbeddings(config) + num_patches = self.patch_embeddings.num_patches + self.patch_grid = self.patch_embeddings.grid_size + self.mask_token = nn.Parameter(torch.zeros(1, 1, config.embed_dim)) if use_mask_token else None + + if config.use_absolute_embeddings: + self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.embed_dim)) + else: + self.position_embeddings = None + + self.norm = nn.LayerNorm(config.embed_dim) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, pixel_values: Optional[torch.FloatTensor], bool_masked_pos: Optional[torch.BoolTensor] = None + ) -> Tuple[torch.Tensor]: + embeddings, output_dimensions = self.patch_embeddings(pixel_values) + embeddings = self.norm(embeddings) + batch_size, seq_len, _ = embeddings.size() + + if bool_masked_pos is not None: + mask_tokens = self.mask_token.expand(batch_size, seq_len, -1) + # replace the masked visual tokens by mask_tokens + mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens) + embeddings = embeddings * (1.0 - mask) + mask_tokens * mask + + if self.position_embeddings is not None: + embeddings = embeddings + self.position_embeddings + + embeddings = self.dropout(embeddings) + + return embeddings, output_dimensions + + +# Copied from transformers.models.swin.modeling_swin.SwinPatchEmbeddings +class DonutSwinPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. + """ + + def __init__(self, config): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.embed_dim + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches = num_patches + self.grid_size = (image_size[0] // patch_size[0], image_size[1] // patch_size[1]) + + self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + + def maybe_pad(self, pixel_values, height, width): + if width % self.patch_size[1] != 0: + pad_values = (0, self.patch_size[1] - width % self.patch_size[1]) + pixel_values = nn.functional.pad(pixel_values, pad_values) + if height % self.patch_size[0] != 0: + pad_values = (0, 0, 0, self.patch_size[0] - height % self.patch_size[0]) + pixel_values = nn.functional.pad(pixel_values, pad_values) + return pixel_values + + def forward(self, pixel_values: Optional[torch.FloatTensor]) -> Tuple[torch.Tensor, Tuple[int]]: + _, num_channels, height, width = pixel_values.shape + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." 
+            )
+        # pad the input to be divisible by self.patch_size, if needed
+        pixel_values = self.maybe_pad(pixel_values, height, width)
+        embeddings = self.projection(pixel_values)
+        _, _, height, width = embeddings.shape
+        output_dimensions = (height, width)
+        embeddings = embeddings.flatten(2).transpose(1, 2)
+
+        return embeddings, output_dimensions
+
+
+# Copied from transformers.models.swin.modeling_swin.SwinPatchMerging
+class DonutSwinPatchMerging(nn.Module):
+    """
+    Patch Merging Layer.
+
+    Args:
+        input_resolution (`Tuple[int]`):
+            Resolution of input feature.
+        dim (`int`):
+            Number of input channels.
+        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
+            Normalization layer class.
+    """
+
+    def __init__(self, input_resolution: Tuple[int], dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None:
+        super().__init__()
+        self.input_resolution = input_resolution
+        self.dim = dim
+        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
+        self.norm = norm_layer(4 * dim)
+
+    def maybe_pad(self, input_feature, height, width):
+        should_pad = (height % 2 == 1) or (width % 2 == 1)
+        if should_pad:
+            pad_values = (0, 0, 0, width % 2, 0, height % 2)
+            input_feature = nn.functional.pad(input_feature, pad_values)
+
+        return input_feature
+
+    def forward(self, input_feature: torch.Tensor, input_dimensions: Tuple[int, int]) -> torch.Tensor:
+        height, width = input_dimensions
+        # `dim` is height * width
+        batch_size, dim, num_channels = input_feature.shape
+
+        input_feature = input_feature.view(batch_size, height, width, num_channels)
+        # pad input to be divisible by width and height, if needed
+        input_feature = self.maybe_pad(input_feature, height, width)
+        # [batch_size, height/2, width/2, num_channels]
+        input_feature_0 = input_feature[:, 0::2, 0::2, :]
+        # [batch_size, height/2, width/2, num_channels]
+        input_feature_1 = input_feature[:, 1::2, 0::2, :]
+        # [batch_size, height/2, width/2, num_channels]
+        input_feature_2 = input_feature[:, 0::2, 1::2, :]
+        # [batch_size, height/2, width/2, num_channels]
+        input_feature_3 = input_feature[:, 1::2, 1::2, :]
+        # [batch_size, height/2, width/2, 4*num_channels]
+        input_feature = torch.cat([input_feature_0, input_feature_1, input_feature_2, input_feature_3], -1)
+        input_feature = input_feature.view(batch_size, -1, 4 * num_channels)  # [batch_size, height/2 * width/2, 4*num_channels]
+
+        input_feature = self.norm(input_feature)
+        input_feature = self.reduction(input_feature)
+
+        return input_feature
+
+
+# Copied from transformers.models.swin.modeling_swin.drop_path
+def drop_path(input, drop_prob=0.0, training=False, scale_by_keep=True):
+    """
+    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
+    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
+    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
+    argument.
+ """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +# Copied from transformers.models.swin.modeling_swin.SwinDropPath +class DonutSwinDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return drop_path(x, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return "p={}".format(self.drop_prob) + + +# Copied from transformers.models.swin.modeling_swin.SwinSelfAttention with Swin->DonutSwin +class DonutSwinSelfAttention(nn.Module): + def __init__(self, config, dim, num_heads): + super().__init__() + if dim % num_heads != 0: + raise ValueError( + f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})" + ) + + self.num_attention_heads = num_heads + self.attention_head_size = int(dim / num_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + window_size = config.window_size + self.window_size = ( + window_size if isinstance(window_size, collections.abc.Iterable) else (window_size, window_size) + ) + + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1), num_heads) + ) + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(self.window_size[0]) + coords_w = torch.arange(self.window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) + coords_flatten = torch.flatten(coords, 1) + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] + relative_coords = relative_coords.permute(1, 2, 0).contiguous() + relative_coords[:, :, 0] += self.window_size[0] - 1 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) + self.register_buffer("relative_position_index", relative_position_index) + + self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + batch_size, dim, num_channels = hidden_states.shape + mixed_query_layer = self.query(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(mixed_query_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+
+        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)]
+        relative_position_bias = relative_position_bias.view(
+            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1
+        )
+
+        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()
+        attention_scores = attention_scores + relative_position_bias.unsqueeze(0)
+
+        if attention_mask is not None:
+            # Apply the attention mask (precomputed for all layers in DonutSwinModel forward() function)
+            mask_shape = attention_mask.shape[0]
+            attention_scores = attention_scores.view(
+                batch_size // mask_shape, mask_shape, self.num_attention_heads, dim, dim
+            )
+            attention_scores = attention_scores + attention_mask.unsqueeze(1).unsqueeze(0)
+            attention_scores = attention_scores.view(-1, self.num_attention_heads, dim, dim)
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(new_context_layer_shape)
+
+        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+        return outputs
+
+
+# Copied from transformers.models.swin.modeling_swin.SwinSelfOutput
+class DonutSwinSelfOutput(nn.Module):
+    def __init__(self, config, dim):
+        super().__init__()
+        self.dense = nn.Linear(dim, dim)
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+
+        return hidden_states
+
+
+# Copied from transformers.models.swin.modeling_swin.SwinAttention with Swin->DonutSwin
+class DonutSwinAttention(nn.Module):
+    def __init__(self, config, dim, num_heads):
+        super().__init__()
+        self.self = DonutSwinSelfAttention(config, dim, num_heads)
+        self.output = DonutSwinSelfOutput(config, dim)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
+        )
+
+        # Prune linear layers
+        self.self.query = prune_linear_layer(self.self.query, index)
+        self.self.key = prune_linear_layer(self.self.key, index)
+        self.self.value = prune_linear_layer(self.self.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
+        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask:
Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.self(hidden_states, attention_mask, head_mask, output_attentions) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.swin.modeling_swin.SwinIntermediate +class DonutSwinIntermediate(nn.Module): + def __init__(self, config, dim): + super().__init__() + self.dense = nn.Linear(dim, int(config.mlp_ratio * dim)) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.swin.modeling_swin.SwinOutput +class DonutSwinOutput(nn.Module): + def __init__(self, config, dim): + super().__init__() + self.dense = nn.Linear(int(config.mlp_ratio * dim), dim) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +# Copied from transformers.models.swin.modeling_swin.SwinLayer with Swin->DonutSwin +class DonutSwinLayer(nn.Module): + def __init__(self, config, dim, input_resolution, num_heads, shift_size=0): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.shift_size = shift_size + self.window_size = config.window_size + self.input_resolution = input_resolution + self.set_shift_and_window_size(input_resolution) + self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps) + self.attention = DonutSwinAttention(config, dim, num_heads) + self.drop_path = DonutSwinDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps) + self.intermediate = DonutSwinIntermediate(config, dim) + self.output = DonutSwinOutput(config, dim) + + def set_shift_and_window_size(self, input_resolution): + if min(input_resolution) <= self.window_size: + # if window size is larger than input resolution, we don't partition windows + self.shift_size = 0 + self.window_size = min(input_resolution) + + def get_attn_mask(self, height, width): + if self.shift_size > 0: + # calculate attention mask for SW-MSA + img_mask = torch.zeros((1, height, width, 1)) + height_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + width_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + count = 0 + for height_slice in height_slices: + for width_slice in width_slices: + img_mask[:, height_slice, width_slice, :] = count + count += 1 + + mask_windows = window_partition(img_mask, self.window_size) + mask_windows = mask_windows.view(-1, self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0)) + else: + attn_mask = None + return attn_mask + + def maybe_pad(self, hidden_states, height, width): + pad_right = (self.window_size - width % 
self.window_size) % self.window_size + pad_bottom = (self.window_size - height % self.window_size) % self.window_size + pad_values = (0, 0, 0, pad_right, 0, pad_bottom) + hidden_states = nn.functional.pad(hidden_states, pad_values) + return hidden_states, pad_values + + def forward( + self, + hidden_states: torch.Tensor, + input_dimensions: Tuple[int, int], + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, torch.Tensor]: + self.set_shift_and_window_size(input_dimensions) + height, width = input_dimensions + batch_size, _, channels = hidden_states.size() + shortcut = hidden_states + + hidden_states = self.layernorm_before(hidden_states) + hidden_states = hidden_states.view(batch_size, height, width, channels) + # pad hidden_states to multiples of window size + hidden_states, pad_values = self.maybe_pad(hidden_states, height, width) + + _, height_pad, width_pad, _ = hidden_states.shape + # cyclic shift + if self.shift_size > 0: + shifted_hidden_states = torch.roll(hidden_states, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) + else: + shifted_hidden_states = hidden_states + + # partition windows + hidden_states_windows = window_partition(shifted_hidden_states, self.window_size) + hidden_states_windows = hidden_states_windows.view(-1, self.window_size * self.window_size, channels) + attn_mask = self.get_attn_mask(height_pad, width_pad) + if attn_mask is not None: + attn_mask = attn_mask.to(hidden_states_windows.device) + + attention_outputs = self.attention( + hidden_states_windows, attn_mask, head_mask, output_attentions=output_attentions + ) + + attention_output = attention_outputs[0] + + attention_windows = attention_output.view(-1, self.window_size, self.window_size, channels) + shifted_windows = window_reverse(attention_windows, self.window_size, height_pad, width_pad) + + # reverse cyclic shift + if self.shift_size > 0: + attention_windows = torch.roll(shifted_windows, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) + else: + attention_windows = shifted_windows + + was_padded = pad_values[3] > 0 or pad_values[5] > 0 + if was_padded: + attention_windows = attention_windows[:, :height, :width, :].contiguous() + + attention_windows = attention_windows.view(batch_size, height * width, channels) + + hidden_states = shortcut + self.drop_path(attention_windows) + + layer_output = self.layernorm_after(hidden_states) + layer_output = self.intermediate(layer_output) + layer_output = hidden_states + self.output(layer_output) + + layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,) + return layer_outputs + + +# Copied from transformers.models.swin.modeling_swin.SwinStage with Swin->DonutSwin +class DonutSwinStage(nn.Module): + def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, downsample): + super().__init__() + self.config = config + self.dim = dim + self.blocks = nn.ModuleList( + [ + DonutSwinLayer( + config=config, + dim=dim, + input_resolution=input_resolution, + num_heads=num_heads, + shift_size=0 if (i % 2 == 0) else config.window_size // 2, + ) + for i in range(depth) + ] + ) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(input_resolution, dim=dim, norm_layer=nn.LayerNorm) + else: + self.downsample = None + + self.pointing = False + + def forward( + self, + hidden_states: torch.Tensor, + input_dimensions: Tuple[int, int], + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: 
Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + height, width = input_dimensions + for i, layer_module in enumerate(self.blocks): + + layer_head_mask = head_mask[i] if head_mask is not None else None + + layer_outputs = layer_module(hidden_states, input_dimensions, layer_head_mask, output_attentions) + + hidden_states = layer_outputs[0] + + if self.downsample is not None: + height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2 + output_dimensions = (height, width, height_downsampled, width_downsampled) + hidden_states = self.downsample(layer_outputs[0], input_dimensions) + else: + output_dimensions = (height, width, height, width) + + stage_outputs = (hidden_states, output_dimensions) + + if output_attentions: + stage_outputs += layer_outputs[1:] + return stage_outputs + + +# Copied from transformers.models.swin.modeling_swin.SwinEncoder with Swin->DonutSwin +class DonutSwinEncoder(nn.Module): + def __init__(self, config, grid_size): + super().__init__() + self.num_layers = len(config.depths) + self.config = config + dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))] + self.layers = nn.ModuleList( + [ + DonutSwinStage( + config=config, + dim=int(config.embed_dim * 2**i_layer), + input_resolution=(grid_size[0] // (2**i_layer), grid_size[1] // (2**i_layer)), + depth=config.depths[i_layer], + num_heads=config.num_heads[i_layer], + drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])], + downsample=DonutSwinPatchMerging if (i_layer < self.num_layers - 1) else None, + ) + for i_layer in range(self.num_layers) + ] + ) + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + input_dimensions: Tuple[int, int], + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[Tuple, DonutSwinEncoderOutput]: + all_input_dimensions = () + all_hidden_states = () if output_hidden_states else None + all_reshaped_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if output_hidden_states: + batch_size, _, hidden_size = hidden_states.shape + # rearrange b (h w) c -> b c h w + reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size) + reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2) + all_hidden_states += (hidden_states,) + all_reshaped_hidden_states += (reshaped_hidden_state,) + + for i, layer_module in enumerate(self.layers): + layer_head_mask = head_mask[i] if head_mask is not None else None + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), hidden_states, input_dimensions, layer_head_mask + ) + else: + layer_outputs = layer_module(hidden_states, input_dimensions, layer_head_mask, output_attentions) + + hidden_states = layer_outputs[0] + output_dimensions = layer_outputs[1] + + input_dimensions = (output_dimensions[-2], output_dimensions[-1]) + all_input_dimensions += (input_dimensions,) + + if output_hidden_states: + batch_size, _, hidden_size = hidden_states.shape + # rearrange b (h w) c -> b c h w + reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size) + 
reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2) + all_hidden_states += (hidden_states,) + all_reshaped_hidden_states += (reshaped_hidden_state,) + + if output_attentions: + all_self_attentions += layer_outputs[2:] + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + + return DonutSwinEncoderOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + reshaped_hidden_states=all_reshaped_hidden_states, + ) + + +# Copied from transformers.models.swin.modeling_swin.SwinPreTrainedModel with Swin->DonutSwin +class DonutSwinPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = DonutSwinConfig + base_model_prefix = "swin" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, DonutSwinEncoder): + module.gradient_checkpointing = value + + +SWIN_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use + it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`DonutSwinConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +SWIN_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See + [`AutoFeatureExtractor.__call__`] for details. + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+""" + + +@add_start_docstrings( + "The bare Donut Swin Model transformer outputting raw hidden-states without any specific head on top.", + SWIN_START_DOCSTRING, +) +class DonutSwinModel(DonutSwinPreTrainedModel): + def __init__(self, config, add_pooling_layer=True, use_mask_token=False): + super().__init__(config) + self.config = config + self.num_layers = len(config.depths) + self.num_features = int(config.embed_dim * 2 ** (self.num_layers - 1)) + + self.embeddings = DonutSwinEmbeddings(config, use_mask_token=use_mask_token) + self.encoder = DonutSwinEncoder(config, self.embeddings.patch_grid) + + self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.patch_embeddings + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=DonutSwinModelOutput, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + bool_masked_pos: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, DonutSwinModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, len(self.config.depths)) + + embedding_output, input_dimensions = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos) + + encoder_outputs = self.encoder( + embedding_output, + input_dimensions, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = encoder_outputs[0] + + pooled_output = None + if self.pooler is not None: + pooled_output = self.pooler(sequence_output.transpose(1, 2)) + pooled_output = torch.flatten(pooled_output, 1) + + if not return_dict: + output = (sequence_output, pooled_output) + encoder_outputs[1:] + + return output + + return DonutSwinModelOutput( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + reshaped_hidden_states=encoder_outputs.reshaped_hidden_states, + ) diff --git a/src/transformers/models/donut/processing_donut.py 
b/src/transformers/models/donut/processing_donut.py
new file mode 100644
index 00000000000000..1b00d894bd0878
--- /dev/null
+++ b/src/transformers/models/donut/processing_donut.py
@@ -0,0 +1,156 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for Donut.
+"""
+import re
+import warnings
+from contextlib import contextmanager
+
+from ...processing_utils import ProcessorMixin
+
+
+class DonutProcessor(ProcessorMixin):
+    r"""
+    Constructs a Donut processor which wraps a Donut feature extractor and an XLMRoBERTa tokenizer into a single
+    processor.
+
+    [`DonutProcessor`] offers all the functionalities of [`DonutFeatureExtractor`] and
+    [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]. See the [`~DonutProcessor.__call__`] and
+    [`~DonutProcessor.decode`] methods for more information.
+
+    Args:
+        feature_extractor ([`DonutFeatureExtractor`]):
+            An instance of [`DonutFeatureExtractor`]. The feature extractor is a required input.
+        tokenizer ([`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]):
+            An instance of [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]. The tokenizer is a required input.
+    """
+    feature_extractor_class = "AutoFeatureExtractor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(self, feature_extractor, tokenizer):
+        super().__init__(feature_extractor, tokenizer)
+        self.current_processor = self.feature_extractor
+        self._in_target_context_manager = False
+
+    def __call__(self, *args, **kwargs):
+        """
+        When used in normal mode, this method forwards all its arguments to AutoFeatureExtractor's
+        [`~AutoFeatureExtractor.__call__`] and returns its output. If used in the context
+        [`~DonutProcessor.as_target_processor`] this method forwards all its arguments to DonutTokenizer's
+        [`~DonutTokenizer.__call__`]. Please refer to the docstring of the above two methods for more information.
+        """
+        # For backward compatibility
+        if self._in_target_context_manager:
+            return self.current_processor(*args, **kwargs)
+
+        images = kwargs.pop("images", None)
+        text = kwargs.pop("text", None)
+        if len(args) > 0:
+            images = args[0]
+            args = args[1:]
+
+        if images is None and text is None:
+            raise ValueError("You need to specify either an `images` or `text` input to process.")
+
+        if images is not None:
+            inputs = self.feature_extractor(images, *args, **kwargs)
+        if text is not None:
+            encodings = self.tokenizer(text, **kwargs)
+
+        if text is None:
+            return inputs
+        elif images is None:
+            return encodings
+        else:
+            inputs["labels"] = encodings["input_ids"]
+            return inputs
+
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to DonutTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please refer
+        to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to DonutTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to the
+        docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+
+    @contextmanager
+    def as_target_processor(self):
+        """
+        Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning
+        Donut.
+        """
+        warnings.warn(
+            "`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
+            "labels by using the argument `text` of the regular `__call__` method (either in the same call as "
+            "your image inputs, or in a separate call)."
+        )
+        self._in_target_context_manager = True
+        self.current_processor = self.tokenizer
+        yield
+        self.current_processor = self.feature_extractor
+        self._in_target_context_manager = False
+
+    def token2json(self, tokens, is_inner_value=False):
+        """
+        Convert a (generated) token sequence into an ordered JSON format.
+        """
+        output = dict()
+
+        while tokens:
+            start_token = re.search(r"<s_(.*?)>", tokens, re.IGNORECASE)
+            if start_token is None:
+                break
+            key = start_token.group(1)
+            end_token = re.search(rf"</s_{key}>", tokens, re.IGNORECASE)
+            start_token = start_token.group()
+            if end_token is None:
+                tokens = tokens.replace(start_token, "")
+            else:
+                end_token = end_token.group()
+                start_token_escaped = re.escape(start_token)
+                end_token_escaped = re.escape(end_token)
+                content = re.search(f"{start_token_escaped}(.*?){end_token_escaped}", tokens, re.IGNORECASE)
+                if content is not None:
+                    content = content.group(1).strip()
+                    if r"<s_" in content and r"</s_" in content:  # non-leaf node
+                        value = self.token2json(content, is_inner_value=True)
+                        if value:
+                            if len(value) == 1:
+                                value = value[0]
+                            output[key] = value
+                    else:  # leaf nodes
+                        output[key] = []
+                        for leaf in content.split(r"<sep/>"):
+                            leaf = leaf.strip()
+                            if leaf in self.tokenizer.get_added_vocab() and leaf[0] == "<" and leaf[-2:] == "/>":
+                                leaf = leaf[1:-2]  # for categorical special tokens
+                            output[key].append(leaf)
+                        if len(output[key]) == 1:
+                            output[key] = output[key][0]
+
+                tokens = tokens[tokens.find(end_token) + len(end_token) :].strip()
+                if tokens[:6] == r"<sep/>":  # non-leaf nodes
+                    return [output] + self.token2json(tokens[6:], is_inner_value=True)
+
+        if len(output):
+            return [output] if is_inner_value else output
+        else:
+            return [] if is_inner_value else {"text_sequence": tokens}
diff --git a/src/transformers/models/vision_encoder_decoder/convert_trocr_unilm_to_pytorch.py b/src/transformers/models/trocr/convert_trocr_unilm_to_pytorch.py
similarity index 100%
rename from src/transformers/models/vision_encoder_decoder/convert_trocr_unilm_to_pytorch.py
rename to src/transformers/models/trocr/convert_trocr_unilm_to_pytorch.py
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index d636be655af284..96a93ecae942a7 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -1682,6 +1682,23 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])


+DONUT_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class DonutSwinModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class DonutSwinPreTrainedModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST = None
diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py
index 6b00f3b3d76d2b..2be4d045350176 100644
--- a/src/transformers/utils/dummy_vision_objects.py
+++ b/src/transformers/utils/dummy_vision_objects.py
@@ -71,6 +71,13 @@ def __init__(self, *args, **kwargs):
     requires_backends(self, ["vision"])


+class
DonutFeatureExtractor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class DPTFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] diff --git a/src/transformers/utils/fx.py b/src/transformers/utils/fx.py index 990f278b0d5066..3c3babd4037780 100644 --- a/src/transformers/utils/fx.py +++ b/src/transformers/utils/fx.py @@ -105,6 +105,7 @@ def _generate_supported_model_class_names( "deberta", "deberta-v2", "distilbert", + "donut-swin", "electra", "gpt2", "gpt_neo", diff --git a/tests/models/donut/__init__.py b/tests/models/donut/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/tests/models/donut/test_feature_extraction_donut.py b/tests/models/donut/test_feature_extraction_donut.py new file mode 100644 index 00000000000000..38ccbf2075a9b1 --- /dev/null +++ b/tests/models/donut/test_feature_extraction_donut.py @@ -0,0 +1,203 @@ +# coding=utf-8 +# Copyright 2022 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +import numpy as np + +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torch_available, is_vision_available + +from ...test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import DonutFeatureExtractor + + +class DonutFeatureExtractionTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=[20, 18], + do_thumbnail=True, + do_align_axis=False, + do_pad=True, + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + ): + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_thumbnail = do_thumbnail + self.do_align_axis = do_align_axis + self.do_pad = do_pad + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + + def prepare_feat_extract_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "do_thumbnail": self.do_thumbnail, + "do_align_long_axis": self.do_align_axis, + "do_pad": self.do_pad, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + } + + +@require_torch +@require_vision +class DonutFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCase): + + feature_extraction_class = DonutFeatureExtractor if is_vision_available() else None + + def setUp(self): + self.feature_extract_tester = DonutFeatureExtractionTester(self) + + @property + def feat_extract_dict(self): + return self.feature_extract_tester.prepare_feat_extract_dict() + + def 
test_feat_extract_properties(self): + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + self.assertTrue(hasattr(feature_extractor, "do_resize")) + self.assertTrue(hasattr(feature_extractor, "size")) + self.assertTrue(hasattr(feature_extractor, "do_thumbnail")) + self.assertTrue(hasattr(feature_extractor, "do_align_long_axis")) + self.assertTrue(hasattr(feature_extractor, "do_pad")) + self.assertTrue(hasattr(feature_extractor, "do_normalize")) + self.assertTrue(hasattr(feature_extractor, "image_mean")) + self.assertTrue(hasattr(feature_extractor, "image_std")) + + def test_batch_feature(self): + pass + + def test_call_pil(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random PIL images + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False) + for image in image_inputs: + self.assertIsInstance(image, Image.Image) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + 1, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size[1], + self.feature_extract_tester.size[0], + ), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size[1], + self.feature_extract_tester.size[0], + ), + ) + + def test_call_numpy(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random numpy tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, numpify=True) + for image in image_inputs: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + 1, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size[1], + self.feature_extract_tester.size[0], + ), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size[1], + self.feature_extract_tester.size[0], + ), + ) + + def test_call_pytorch(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random PyTorch tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True) + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + 1, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size[1], + self.feature_extract_tester.size[0], + ), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size[1], + self.feature_extract_tester.size[0], + ), + ) 
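For reviewers, a minimal usage sketch of the feature extractor exercised by the tests above. This snippet is not part of the patch: it mirrors the tester defaults (`size=[20, 18]`, i.e. `(width, height)`) and substitutes a random array for a real document scan; the expected output shape follows the assertions in the tests.

    import numpy as np
    from transformers import DonutFeatureExtractor

    # Mirror the tester defaults above; every value here is illustrative.
    feature_extractor = DonutFeatureExtractor(
        do_resize=True,
        size=[20, 18],  # (width, height)
        do_thumbnail=True,
        do_align_long_axis=False,
        do_pad=True,
        do_normalize=True,
        image_mean=[0.5, 0.5, 0.5],
        image_std=[0.5, 0.5, 0.5],
    )

    # A random HWC uint8 image standing in for a scanned document page.
    image = np.random.randint(0, 256, size=(30, 40, 3), dtype=np.uint8)

    # rotate -> resize -> thumbnail -> pad -> normalize, then batched.
    encoding = feature_extractor(image, return_tensors="pt")
    print(encoding.pixel_values.shape)  # expected: torch.Size([1, 3, 18, 20])

Note that `size` is given as `(width, height)` while the returned `pixel_values` are `(batch, channels, height, width)`, which is why the last two dimensions appear swapped relative to `size`.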
diff --git a/tests/models/donut/test_modeling_donut_swin.py b/tests/models/donut/test_modeling_donut_swin.py new file mode 100644 index 00000000000000..f909d961880a97 --- /dev/null +++ b/tests/models/donut/test_modeling_donut_swin.py @@ -0,0 +1,464 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch Donut Swin model. """ + +import collections +import inspect +import os +import pickle +import tempfile +import unittest + +from transformers import DonutSwinConfig +from transformers.testing_utils import require_torch, slow, torch_device +from transformers.utils import is_torch_available, is_torch_fx_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import DonutSwinModel + from transformers.models.donut.modeling_donut_swin import DONUT_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST + +if is_torch_fx_available(): + from transformers.utils.fx import symbolic_trace + + +class DonutSwinModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=32, + patch_size=2, + num_channels=3, + embed_dim=16, + depths=[1, 2, 1], + num_heads=[2, 2, 4], + window_size=2, + mlp_ratio=2.0, + qkv_bias=True, + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + drop_path_rate=0.1, + hidden_act="gelu", + use_absolute_embeddings=False, + patch_norm=True, + initializer_range=0.02, + layer_norm_eps=1e-5, + is_training=True, + scope=None, + use_labels=True, + type_sequence_label_size=10, + encoder_stride=8, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.embed_dim = embed_dim + self.depths = depths + self.num_heads = num_heads + self.window_size = window_size + self.mlp_ratio = mlp_ratio + self.qkv_bias = qkv_bias + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.drop_path_rate = drop_path_rate + self.hidden_act = hidden_act + self.use_absolute_embeddings = use_absolute_embeddings + self.patch_norm = patch_norm + self.layer_norm_eps = layer_norm_eps + self.initializer_range = initializer_range + self.is_training = is_training + self.scope = scope + self.use_labels = use_labels + self.type_sequence_label_size = type_sequence_label_size + self.encoder_stride = encoder_stride + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return DonutSwinConfig( + image_size=self.image_size, + patch_size=self.patch_size, + 
num_channels=self.num_channels,
+            embed_dim=self.embed_dim,
+            depths=self.depths,
+            num_heads=self.num_heads,
+            window_size=self.window_size,
+            mlp_ratio=self.mlp_ratio,
+            qkv_bias=self.qkv_bias,
+            hidden_dropout_prob=self.hidden_dropout_prob,
+            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
+            drop_path_rate=self.drop_path_rate,
+            hidden_act=self.hidden_act,
+            use_absolute_embeddings=self.use_absolute_embeddings,
+            patch_norm=self.patch_norm,
+            layer_norm_eps=self.layer_norm_eps,
+            initializer_range=self.initializer_range,
+            encoder_stride=self.encoder_stride,
+        )
+
+    def create_and_check_model(self, config, pixel_values, labels):
+        model = DonutSwinModel(config=config)
+        model.to(torch_device)
+        model.eval()
+        result = model(pixel_values)
+
+        expected_seq_len = ((config.image_size // config.patch_size) ** 2) // (4 ** (len(config.depths) - 1))
+        expected_dim = int(config.embed_dim * 2 ** (len(config.depths) - 1))
+
+        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, expected_seq_len, expected_dim))
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        (
+            config,
+            pixel_values,
+            labels,
+        ) = config_and_inputs
+        inputs_dict = {"pixel_values": pixel_values}
+        return config, inputs_dict
+
+
+@require_torch
+class DonutSwinModelTest(ModelTesterMixin, unittest.TestCase):
+
+    all_model_classes = (DonutSwinModel,) if is_torch_available() else ()
+    fx_compatible = True
+
+    test_pruning = False
+    test_resize_embeddings = False
+    test_head_masking = False
+
+    def setUp(self):
+        self.model_tester = DonutSwinModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=DonutSwinConfig, embed_dim=37)
+
+    def test_config(self):
+        self.create_and_test_config_common_properties()
+        self.config_tester.create_and_test_config_to_json_string()
+        self.config_tester.create_and_test_config_to_json_file()
+        self.config_tester.create_and_test_config_from_and_save_pretrained()
+        self.config_tester.create_and_test_config_with_num_labels()
+        self.config_tester.check_config_can_be_init_without_params()
+        self.config_tester.check_config_arguments_init()
+
+    def create_and_test_config_common_properties(self):
+        return
+
+    def test_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_model(*config_and_inputs)
+
+    def test_inputs_embeds(self):
+        # DonutSwin does not use inputs_embeds
+        pass
+
+    def test_model_common_attributes(self):
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
+            x = model.get_output_embeddings()
+            self.assertTrue(x is None or isinstance(x, nn.Linear))
+
+    def test_forward_signature(self):
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            signature = inspect.signature(model.forward)
+            # signature.parameters is an OrderedDict => so arg_names order is deterministic
+            arg_names = [*signature.parameters.keys()]
+
+            expected_arg_names = ["pixel_values"]
+            self.assertListEqual(arg_names[:1], expected_arg_names)
+
+    def test_attention_outputs(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        config.return_dict = True
+
+        for model_class in self.all_model_classes:
+            inputs_dict["output_attentions"] = True
+
inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + expected_num_attentions = len(self.model_tester.depths) + self.assertEqual(len(attentions), expected_num_attentions) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + window_size_squared = config.window_size**2 + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + self.assertEqual(len(attentions), expected_num_attentions) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_heads[0], window_size_squared, window_size_squared], + ) + out_len = len(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if hasattr(self.model_tester, "num_hidden_states_types"): + added_hidden_states = self.model_tester.num_hidden_states_types + else: + # also another +1 for reshaped_hidden_states + added_hidden_states = 2 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.attentions + + self.assertEqual(len(self_attentions), expected_num_attentions) + + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_heads[0], window_size_squared, window_size_squared], + ) + + def check_hidden_states_output(self, inputs_dict, config, model_class, image_size): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", len(self.model_tester.depths) + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + # DonutSwin has a different seq_length + patch_size = ( + config.patch_size + if isinstance(config.patch_size, collections.abc.Iterable) + else (config.patch_size, config.patch_size) + ) + + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [num_patches, self.model_tester.embed_dim], + ) + + reshaped_hidden_states = outputs.reshaped_hidden_states + self.assertEqual(len(reshaped_hidden_states), expected_num_layers) + + batch_size, num_channels, height, width = reshaped_hidden_states[0].shape + reshaped_hidden_states = ( + reshaped_hidden_states[0].view(batch_size, num_channels, height * width).permute(0, 2, 1) + ) + self.assertListEqual( + list(reshaped_hidden_states.shape[-2:]), + [num_patches, self.model_tester.embed_dim], + ) + + def test_hidden_states_output(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + image_size = ( + self.model_tester.image_size + if isinstance(self.model_tester.image_size, collections.abc.Iterable) + else (self.model_tester.image_size, self.model_tester.image_size) + ) + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = 
True + self.check_hidden_states_output(inputs_dict, config, model_class, image_size) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + self.check_hidden_states_output(inputs_dict, config, model_class, image_size) + + def test_hidden_states_output_with_padding(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.patch_size = 3 + + image_size = ( + self.model_tester.image_size + if isinstance(self.model_tester.image_size, collections.abc.Iterable) + else (self.model_tester.image_size, self.model_tester.image_size) + ) + patch_size = ( + config.patch_size + if isinstance(config.patch_size, collections.abc.Iterable) + else (config.patch_size, config.patch_size) + ) + + padded_height = image_size[0] + patch_size[0] - (image_size[0] % patch_size[0]) + padded_width = image_size[1] + patch_size[1] - (image_size[1] % patch_size[1]) + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + self.check_hidden_states_output(inputs_dict, config, model_class, (padded_height, padded_width)) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + self.check_hidden_states_output(inputs_dict, config, model_class, (padded_height, padded_width)) + + @slow + def test_model_from_pretrained(self): + for model_name in DONUT_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = DonutSwinModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if "embeddings" not in name and param.requires_grad: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=False): + if not is_torch_fx_available() or not self.fx_compatible: + return + + configs_no_init = _config_zero_init(config) # To be sure we have no Nan + configs_no_init.return_dict = False + + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + model.to(torch_device) + model.eval() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=output_loss) + + try: + if model.config.is_encoder_decoder: + model.config.use_cache = False # FSTM still requires this hack -> FSTM should probably be refactored similar to BART afterward + labels = inputs.get("labels", None) + input_names = ["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask"] + if labels is not None: + input_names.append("labels") + + filtered_inputs = {k: v for (k, v) in inputs.items() if k in input_names} + input_names = list(filtered_inputs.keys()) + + model_output = model(**filtered_inputs) + + traced_model = symbolic_trace(model, input_names) + traced_output = traced_model(**filtered_inputs) + else: + input_names = ["input_ids", "attention_mask", "token_type_ids", "pixel_values"] + + labels = inputs.get("labels", None) + start_positions = inputs.get("start_positions", None) + end_positions = inputs.get("end_positions", None) + if labels is not None: + input_names.append("labels") + 
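# Question-answering heads receive span boundaries (`start_positions`/`end_positions`)
# instead of `labels`, so those are collected as extra trace inputs below.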
if start_positions is not None: + input_names.append("start_positions") + if end_positions is not None: + input_names.append("end_positions") + + filtered_inputs = {k: v for (k, v) in inputs.items() if k in input_names} + input_names = list(filtered_inputs.keys()) + + model_output = model(**filtered_inputs) + + traced_model = symbolic_trace(model, input_names) + traced_output = traced_model(**filtered_inputs) + + except RuntimeError as e: + self.fail(f"Couldn't trace module: {e}") + + def flatten_output(output): + flatten = [] + for x in output: + if isinstance(x, (tuple, list)): + flatten += flatten_output(x) + elif not isinstance(x, torch.Tensor): + continue + else: + flatten.append(x) + return flatten + + model_output = flatten_output(model_output) + traced_output = flatten_output(traced_output) + num_outputs = len(model_output) + + for i in range(num_outputs): + self.assertTrue( + torch.allclose(model_output[i], traced_output[i]), + f"traced {i}th output doesn't match model {i}th output for {model_class}", + ) + + # Test that the model can be serialized and restored properly + with tempfile.TemporaryDirectory() as tmp_dir_name: + pkl_file_name = os.path.join(tmp_dir_name, "model.pkl") + try: + with open(pkl_file_name, "wb") as f: + pickle.dump(traced_model, f) + with open(pkl_file_name, "rb") as f: + loaded = pickle.load(f) + except Exception as e: + self.fail(f"Couldn't serialize / deserialize the traced model: {e}") + + loaded_output = loaded(**filtered_inputs) + loaded_output = flatten_output(loaded_output) + + for i in range(num_outputs): + self.assertTrue( + torch.allclose(model_output[i], loaded_output[i]), + f"serialized model {i}th output doesn't match model {i}th output for {model_class}", + ) diff --git a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py index 320cdd63306262..7570888097c533 100644 --- a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py +++ b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py @@ -13,14 +13,22 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- +import re import tempfile import unittest from datasets import load_dataset from packaging import version -from transformers.testing_utils import require_torch, require_vision, slow, to_2tuple, torch_device +from transformers import DonutProcessor, TrOCRProcessor +from transformers.testing_utils import ( + require_sentencepiece, + require_torch, + require_vision, + slow, + to_2tuple, + torch_device, +) from transformers.utils import cached_property, is_torch_available, is_vision_available from ...test_modeling_common import floats_tensor, ids_tensor, random_attention_mask @@ -54,7 +62,7 @@ import PIL from PIL import Image - from transformers import TrOCRProcessor, ViTFeatureExtractor + from transformers import ViTFeatureExtractor @require_torch @@ -654,8 +662,8 @@ def default_processor(self): def test_inference_handwritten(self): model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten").to(torch_device) - ds = load_dataset("hf-internal-testing/fixtures_ocr", split="test") - image = Image.open(ds[0]["file"]).convert("RGB") + dataset = load_dataset("hf-internal-testing/fixtures_ocr", split="test") + image = Image.open(dataset[0]["file"]).convert("RGB") processor = self.default_processor pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(torch_device) @@ -679,8 +687,8 @@ def test_inference_handwritten(self): def test_inference_printed(self): model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed").to(torch_device) - ds = load_dataset("hf-internal-testing/fixtures_ocr", split="test") - image = Image.open(ds[1]["file"]).convert("RGB") + dataset = load_dataset("hf-internal-testing/fixtures_ocr", split="test") + image = Image.open(dataset[1]["file"]).convert("RGB") processor = self.default_processor pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(torch_device) @@ -774,3 +782,197 @@ def generate_step(pixel_values): # should produce # ["a cat laying on top of a couch next to another cat"] self.assertEqual(preds, ["a cat laying on top of a couch next to another cat"]) + + +@require_vision +@require_torch +@require_sentencepiece +class DonutModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_docvqa(self): + processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa") + model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa").to( + torch_device + ) + + dataset = load_dataset("hf-internal-testing/example-documents", split="test") + image = dataset[0]["image"] + + pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(torch_device) + decoder_input_ids = processor.tokenizer( + "<s_docvqa>", add_special_tokens=False, return_tensors="pt" + ).input_ids.to(torch_device) + + # step 1: single forward pass + with torch.no_grad(): + outputs = model(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids) + logits = outputs.logits + + # verify the logits + expected_shape = torch.Size([1, 1, 57532]) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = torch.tensor([24.2731, -6.4522, 32.4130]).to(torch_device) + self.assertTrue(torch.allclose(logits[0, 0, :3], expected_slice, atol=1e-4)) + + # step 2: generation + task_prompt = "<s_docvqa><s_question>{user_input}</s_question><s_answer>" + question = "When is the coffee break?"
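# Donut casts DocVQA as prompted generation: the question is spliced into the
# task prompt and the model continues with the answer tokens after <s_answer>.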
+ prompt = task_prompt.replace("{user_input}", question) + decoder_input_ids = processor.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids + decoder_input_ids = decoder_input_ids.to(torch_device) + + outputs = model.generate( + pixel_values, + decoder_input_ids=decoder_input_ids, + max_length=model.decoder.config.max_position_embeddings, + early_stopping=True, + pad_token_id=processor.tokenizer.pad_token_id, + eos_token_id=processor.tokenizer.eos_token_id, + use_cache=True, + num_beams=1, + bad_words_ids=[[processor.tokenizer.unk_token_id]], + output_scores=True, + return_dict_in_generate=True, + ) + sequence = processor.batch_decode(outputs.sequences)[0] + sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "") + sequence = re.sub(r"<.*?>", "", sequence, count=1).strip() # remove first task start token + + # verify generated sequence + self.assertEqual( + sequence, "<s_question> When is the coffee break?</s_question> <s_answer> 11-14 to 11:39 a.m.</s_answer>" + ) + + # verify scores + self.assertEqual(len(outputs.scores), 11) + self.assertTrue( + torch.allclose( + outputs.scores[0][0, :3], torch.tensor([5.3153, -3.5276, 13.4781], device=torch_device), atol=1e-4 + ) + ) + + @slow + def test_inference_cordv2(self): + processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2") + model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2").to( + torch_device + ) + + dataset = load_dataset("hf-internal-testing/example-documents", split="test") + image = dataset[2]["image"] + + pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(torch_device) + decoder_input_ids = processor.tokenizer( + "<s_cord-v2>", add_special_tokens=False, return_tensors="pt" + ).input_ids.to(torch_device) + + # step 1: single forward pass + with torch.no_grad(): + outputs = model(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids) + logits = outputs.logits + + # verify the logits + expected_shape = torch.Size((1, 1, model.decoder.config.vocab_size)) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = torch.tensor([-27.4344, -3.2686, -19.3524], device=torch_device) + self.assertTrue(torch.allclose(logits[0, 0, :3], expected_slice, atol=1e-4)) + + # step 2: generation + task_prompt = "<s_cord-v2>" + decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids + decoder_input_ids = decoder_input_ids.to(torch_device) + + outputs = model.generate( + pixel_values, + decoder_input_ids=decoder_input_ids, + max_length=model.decoder.config.max_position_embeddings, + early_stopping=True, + pad_token_id=processor.tokenizer.pad_token_id, + eos_token_id=processor.tokenizer.eos_token_id, + use_cache=True, + num_beams=1, + bad_words_ids=[[processor.tokenizer.unk_token_id]], + output_scores=True, + return_dict_in_generate=True, + ) + + sequence = processor.batch_decode(outputs.sequences)[0] + sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "") + sequence = re.sub(r"<.*?>", "", sequence, count=1).strip() # remove first task start token + + # verify generated sequence + # fmt: off + expected_sequence = "<s_menu><s_nm> CINNAMON SUGAR</s_nm><s_unitprice> 17,000</s_unitprice><s_cnt> 1 x</s_cnt><s_price> 17,000</s_price></s_menu><s_sub_total><s_subtotal_price> 17,000</s_subtotal_price></s_sub_total><s_total><s_total_price> 17,000</s_total_price><s_cashprice> 20,000</s_cashprice><s_changeprice> 3,000</s_changeprice></s_total>" # noqa: E231 + # fmt: on + self.assertEqual(sequence, expected_sequence) + + # verify scores + self.assertEqual(len(outputs.scores), 43) + self.assertTrue( + torch.allclose( + outputs.scores[0][0, :3], torch.tensor([-27.4344, -3.2686, -19.3524],
device=torch_device), atol=1e-4 + ) + ) + + @slow + def test_inference_rvlcdip(self): + processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-rvlcdip") + model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-rvlcdip").to( + torch_device + ) + + dataset = load_dataset("hf-internal-testing/example-documents", split="test") + image = dataset[1]["image"] + + pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(torch_device) + + # step 1: single forward pass + decoder_input_ids = processor.tokenizer( + "<s_rvlcdip>", add_special_tokens=False, return_tensors="pt" + ).input_ids.to(torch_device) + with torch.no_grad(): + outputs = model(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids) + logits = outputs.logits + + # verify the logits + expected_shape = torch.Size((1, 1, model.decoder.config.vocab_size)) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = torch.tensor([-17.6490, -4.8381, -15.7577], device=torch_device) + self.assertTrue(torch.allclose(logits[0, 0, :3], expected_slice, atol=1e-4)) + + # step 2: generation + task_prompt = "<s_rvlcdip>" + decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids + decoder_input_ids = decoder_input_ids.to(torch_device) + + outputs = model.generate( + pixel_values, + decoder_input_ids=decoder_input_ids, + max_length=model.decoder.config.max_position_embeddings, + early_stopping=True, + pad_token_id=processor.tokenizer.pad_token_id, + eos_token_id=processor.tokenizer.eos_token_id, + use_cache=True, + num_beams=1, + bad_words_ids=[[processor.tokenizer.unk_token_id]], + output_scores=True, + return_dict_in_generate=True, + ) + + sequence = processor.batch_decode(outputs.sequences)[0] + sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "") + sequence = re.sub(r"<.*?>", "", sequence, count=1).strip() # remove first task start token + + # verify generated sequence + self.assertEqual(sequence, "<s_class><advertisement/></s_class>") + + # verify scores + self.assertEqual(len(outputs.scores), 4) + self.assertTrue( + torch.allclose( + outputs.scores[0][0, :3], torch.tensor([-17.6490, -4.8381, -15.7577], device=torch_device), atol=1e-4 + ) + ) diff --git a/utils/check_copies.py b/utils/check_copies.py index e2e0e1a53e4332..7d571736544687 100644 --- a/utils/check_copies.py +++ b/utils/check_copies.py @@ -471,6 +471,7 @@ def check_model_list_copy(overwrite=False, max_per_line=119): "Data2VecAudio": "Data2Vec", "Data2VecText": "Data2Vec", "Data2VecVision": "Data2Vec", + "DonutSwin": "Donut", "Marian": "MarianMT", "OpenAI GPT-2": "GPT-2", "OpenAI GPT": "GPT", diff --git a/utils/check_repo.py b/utils/check_repo.py index d2271e87ebf178..254467113d6cb4 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -206,6 +206,7 @@ ("data2vec-text", "data2vec"), ("data2vec-audio", "data2vec"), ("data2vec-vision", "data2vec"), + ("donut-swin", "donut"), ] ) diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt index 1941a7343a6bc9..0edda8ae5a4c3c 100644 --- a/utils/documentation_tests.txt +++ b/utils/documentation_tests.txt @@ -8,6 +8,7 @@ docs/source/en/model_doc/t5.mdx docs/source/en/model_doc/t5v1.1.mdx docs/source/en/model_doc/byt5.mdx docs/source/en/model_doc/tapex.mdx +docs/source/en/model_doc/donut.mdx docs/source/en/model_doc/encoder-decoder.mdx src/transformers/generation_utils.py src/transformers/models/albert/modeling_albert.py From 089ad230a4c2f680d78283b096b518b006ff89bd
Mon Sep 17 00:00:00 2001 From: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Date: Fri, 12 Aug 2022 18:52:49 +0200 Subject: [PATCH 130/162] Fix URLs (#18604) Co-authored-by: Niels Rogge --- docs/source/en/model_doc/donut.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/model_doc/donut.mdx b/docs/source/en/model_doc/donut.mdx index 9c9973be022e7c..7f692f113cb5a3 100644 --- a/docs/source/en/model_doc/donut.mdx +++ b/docs/source/en/model_doc/donut.mdx @@ -33,7 +33,7 @@ This model was contributed by [nielsr](https://huggingface.co/nielsr). The origi Tips: - The quickest way to get started with Donut is by checking the [tutorial - notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/donut), which show how to use the model + notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/Donut), which show how to use the model at inference time as well as fine-tuning on custom data. - Donut is always used within the [VisionEncoderDecoder](vision-encoder-decoder) framework. @@ -188,7 +188,7 @@ See the [model hub](https://huggingface.co/models?filter=donut) to look for Donu ## Training -We refer to the [tutorial notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/donut). +We refer to the [tutorial notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/Donut). ## DonutSwinConfig From f1590b232909ce44425ef4cf397fad49c847943c Mon Sep 17 00:00:00 2001 From: Niklas Muennighoff Date: Fri, 12 Aug 2022 19:36:18 +0200 Subject: [PATCH 131/162] Update BLOOM parameter counts (#18531) * Update BLOOM parameter counts * Update BLOOM parameter counts --- docs/source/en/model_doc/bloom.mdx | 10 +++--- .../models/bloom/configuration_bloom.py | 10 +++--- .../models/bloom/modeling_bloom.py | 12 +++---- .../models/bloom/tokenization_bloom_fast.py | 10 +++--- tests/models/bloom/test_modeling_bloom.py | 34 +++++++++---------- tests/onnx/test_onnx_v2.py | 2 +- 6 files changed, 39 insertions(+), 39 deletions(-) diff --git a/docs/source/en/model_doc/bloom.mdx b/docs/source/en/model_doc/bloom.mdx index 79a45bd7bf1d48..cf415603d0fe8b 100644 --- a/docs/source/en/model_doc/bloom.mdx +++ b/docs/source/en/model_doc/bloom.mdx @@ -18,11 +18,11 @@ The BLOOM model has been proposed with its various versions through the [BigScie The architecture of BLOOM is essentially similar to GPT3 (auto-regressive model for next token prediction), but has been trained on 46 different languages and 13 programming languages. Several smaller versions of the models have been trained on the same dataset. 
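A lightweight way to check that the renamed checkpoints resolve is to load a configuration under one of the new names (a minimal sketch; it only downloads `config.json`, and the printed fields assume the standard `BloomConfig` attributes):

```python
from transformers import AutoConfig

# "bigscience/bloom-560m" is the new name of the former "bigscience/bloom-350m"
config = AutoConfig.from_pretrained("bigscience/bloom-560m")
print(config.model_type, config.n_layer, config.n_head)
```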
BLOOM is available in the following versions: -- [bloom-350m](https://huggingface.co/bigscience/bloom-350m) -- [bloom-760m](https://huggingface.co/bigscience/bloom-760m) -- [bloom-1b3](https://huggingface.co/bigscience/bloom-1b3) -- [bloom-2b5](https://huggingface.co/bigscience/bloom-2b5) -- [bloom-6b3](https://huggingface.co/bigscience/bloom-6b3) +- [bloom-560m](https://huggingface.co/bigscience/bloom-560m) +- [bloom-1b1](https://huggingface.co/bigscience/bloom-1b1) +- [bloom-1b7](https://huggingface.co/bigscience/bloom-1b7) +- [bloom-3b](https://huggingface.co/bigscience/bloom-3b) +- [bloom-7b1](https://huggingface.co/bigscience/bloom-7b1) - [bloom](https://huggingface.co/bigscience/bloom) (176B parameters) diff --git a/src/transformers/models/bloom/configuration_bloom.py b/src/transformers/models/bloom/configuration_bloom.py index a33a6339b14e99..10acdcbc68e154 100644 --- a/src/transformers/models/bloom/configuration_bloom.py +++ b/src/transformers/models/bloom/configuration_bloom.py @@ -31,11 +31,11 @@ BLOOM_PRETRAINED_CONFIG_ARCHIVE_MAP = { "bigscience/bloom": "https://huggingface.co/bigscience/bloom/resolve/main/config.json", - "bigscience/bloom-350m": "https://huggingface.co/bigscience/bloom-350m/blob/main/config.json", - "bigscience/bloom-760m": "https://huggingface.co/bigscience/bloom-760m/blob/main/config.json", - "bigscience/bloom-1b3": "https://huggingface.co/bigscience/bloom-1b3/blob/main/config.json", - "bigscience/bloom-2b5": "https://huggingface.co/bigscience/bloom-2b5/blob/main/config.json", - "bigscience/bloom-6b3": "https://huggingface.co/bigscience/bloom-6b3/blob/main/config.json", + "bigscience/bloom-560m": "https://huggingface.co/bigscience/bloom-560m/blob/main/config.json", + "bigscience/bloom-1b1": "https://huggingface.co/bigscience/bloom-1b1/blob/main/config.json", + "bigscience/bloom-1b7": "https://huggingface.co/bigscience/bloom-1b7/blob/main/config.json", + "bigscience/bloom-3b": "https://huggingface.co/bigscience/bloom-3b/blob/main/config.json", + "bigscience/bloom-7b1": "https://huggingface.co/bigscience/bloom-7b1/blob/main/config.json", } diff --git a/src/transformers/models/bloom/modeling_bloom.py b/src/transformers/models/bloom/modeling_bloom.py index a33054a3835113..21eaded45b0c76 100644 --- a/src/transformers/models/bloom/modeling_bloom.py +++ b/src/transformers/models/bloom/modeling_bloom.py @@ -38,17 +38,17 @@ logger = logging.get_logger(__name__) -_CHECKPOINT_FOR_DOC = "bigscience/bloom-350m" +_CHECKPOINT_FOR_DOC = "bigscience/bloom-560m" _CONFIG_FOR_DOC = "BloomConfig" _TOKENIZER_FOR_DOC = "BloomTokenizerFast" BLOOM_PRETRAINED_MODEL_ARCHIVE_LIST = [ "bigscience/bigscience-small-testing", - "bigscience/bloom-350m", - "bigscience/bloom-760m", - "bigscience/bloom-1b3", - "bigscience/bloom-2b5", - "bigscience/bloom-6b3", + "bigscience/bloom-560m", + "bigscience/bloom-1b1", + "bigscience/bloom-1b7", + "bigscience/bloom-3b", + "bigscience/bloom-7b1", "bigscience/bloom", ] diff --git a/src/transformers/models/bloom/tokenization_bloom_fast.py b/src/transformers/models/bloom/tokenization_bloom_fast.py index 7c5f9b24072df7..1d6f405039a80b 100644 --- a/src/transformers/models/bloom/tokenization_bloom_fast.py +++ b/src/transformers/models/bloom/tokenization_bloom_fast.py @@ -36,11 +36,11 @@ PRETRAINED_VOCAB_FILES_MAP = { "tokenizer_file": { "bigscience/tokenizer": "https://huggingface.co/bigscience/tokenizer/blob/main/tokenizer.json", - "bigscience/bloom-350m": "https://huggingface.co/bigscience/bloom-350m/blob/main/tokenizer.json", - "bigscience/bloom-760m": 
"https://huggingface.co/bigscience/bloom-760m/blob/main/tokenizer.json", - "bigscience/bloom-1b3": "https://huggingface.co/bigscience/bloom-1b3/blob/main/tokenizer.json", - "bigscience/bloom-2b5": "https://huggingface.co/bigscience/bloom-2b5/blob/main/tokenizer.json", - "bigscience/bloom-6b3": "https://huggingface.co/bigscience/bloom-2b5/blob/main/tokenizer.json", + "bigscience/bloom-560m": "https://huggingface.co/bigscience/bloom-560m/blob/main/tokenizer.json", + "bigscience/bloom-1b1": "https://huggingface.co/bigscience/bloom-1b1/blob/main/tokenizer.json", + "bigscience/bloom-1b7": "https://huggingface.co/bigscience/bloom-1b7/blob/main/tokenizer.json", + "bigscience/bloom-3b": "https://huggingface.co/bigscience/bloom-3b/blob/main/tokenizer.json", + "bigscience/bloom-7b1": "https://huggingface.co/bigscience/bloom-7b1/blob/main/tokenizer.json", "bigscience/bloom": "https://huggingface.co/bigscience/bloom/blob/main/tokenizer.json", }, } diff --git a/tests/models/bloom/test_modeling_bloom.py b/tests/models/bloom/test_modeling_bloom.py index 4570cb767326c0..12f66b63a837a3 100644 --- a/tests/models/bloom/test_modeling_bloom.py +++ b/tests/models/bloom/test_modeling_bloom.py @@ -379,27 +379,27 @@ def test_model_from_pretrained(self): def test_simple_generation(self): # This test is a bit flaky. For some GPU architectures, pytorch sets by default allow_fp16_reduced_precision_reduction = True and some operations # do not give the same results under this configuration, especially torch.baddmm and torch.bmm. https://pytorch.org/docs/stable/notes/numerical_accuracy.html#fp16-on-mi200 - # As we leave the default value (True) for allow_fp16_reduced_precision_reduction , the tests failed when running in half-precision with smaller models (350m) + # As we leave the default value (True) for allow_fp16_reduced_precision_reduction , the tests failed when running in half-precision with smaller models (560m) # Please see: https://pytorch.org/docs/stable/notes/cuda.html#reduced-precision-reduction-in-fp16-gemms # This discrepancy is observed only when using small models and seems to be stable for larger models. # Our conclusion is that these operations are flaky for small inputs but seems to be stable for larger inputs (for the functions `baddmm` and `bmm`), and therefore for larger models. # Here is a summary of an ablation study of our observations # EXPECTED_OUTPUT = "I enjoy walking with my cute dog, and I love to watch the kids play. I am a very active person, and I am a very good listener. I am a very good person, and I am a very good person. I am a" - # 350m + allow_fp16_reduced_precision_reduction = False + torch.bmm ==> PASS - # 350m + allow_fp16_reduced_precision_reduction = False + torch.baddm ==> PASS - # 350m + allow_fp16_reduced_precision_reduction = True + torch.baddm ==> PASS - # 350m + allow_fp16_reduced_precision_reduction = True + torch.bmm ==> FAIL + # 560m + allow_fp16_reduced_precision_reduction = False + torch.bmm ==> PASS + # 560m + allow_fp16_reduced_precision_reduction = False + torch.baddm ==> PASS + # 560m + allow_fp16_reduced_precision_reduction = True + torch.baddm ==> PASS + # 560m + allow_fp16_reduced_precision_reduction = True + torch.bmm ==> FAIL # EXPECTED_OUTPUT = "I enjoy walking with my cute dog, but I also enjoy hiking, biking, and swimming. I love to cook and bake. I love to cook and bake. I love to cook and bake. I love to cook and bake. 
I love" - # >=760m + allow_fp16_reduced_precision_reduction = True + torch.baddm ==> PASS (for use_cache=True and use_cache=False) - # >=760m + allow_fp16_reduced_precision_reduction = True + torch.bmm ==> PASS - # >=760m + allow_fp16_reduced_precision_reduction = False + torch.bmm ==> PASS + # >=1b1 + allow_fp16_reduced_precision_reduction = True + torch.baddm ==> PASS (for use_cache=True and use_cache=False) + # >=1b1 + allow_fp16_reduced_precision_reduction = True + torch.bmm ==> PASS + # >=1b1 + allow_fp16_reduced_precision_reduction = False + torch.bmm ==> PASS - path_350m = "bigscience/bloom-350m" - model = BloomForCausalLM.from_pretrained(path_350m, use_cache=True, revision="gs555750").cuda() + path_560m = "bigscience/bloom-560m" + model = BloomForCausalLM.from_pretrained(path_560m, use_cache=True, revision="gs555750").cuda() model = model.eval() - tokenizer = BloomTokenizerFast.from_pretrained(path_350m) + tokenizer = BloomTokenizerFast.from_pretrained(path_560m) input_sentence = "I enjoy walking with my cute dog" # This output has been obtained using fp32 model on the huggingface DGX workstation - NVIDIA A100 GPU @@ -416,10 +416,10 @@ def test_simple_generation(self): @slow @require_torch_gpu def test_batch_generation(self): - path_350m = "bigscience/bloom-350m" - model = BloomForCausalLM.from_pretrained(path_350m, use_cache=True, revision="gs555750").cuda() + path_560m = "bigscience/bloom-560m" + model = BloomForCausalLM.from_pretrained(path_560m, use_cache=True, revision="gs555750").cuda() model = model.eval() - tokenizer = BloomTokenizerFast.from_pretrained(path_350m, padding_side="left") + tokenizer = BloomTokenizerFast.from_pretrained(path_560m, padding_side="left") input_sentence = ["I enjoy walking with my cute dog", "I enjoy walking with my cute dog"] @@ -437,10 +437,10 @@ def test_batch_generation(self): @require_torch_gpu def test_batch_generation_padd(self): - path_350m = "bigscience/bloom-350m" - model = BloomForCausalLM.from_pretrained(path_350m, use_cache=True, revision="gs555750").cuda() + path_560m = "bigscience/bloom-560m" + model = BloomForCausalLM.from_pretrained(path_560m, use_cache=True, revision="gs555750").cuda() model = model.eval() - tokenizer = BloomTokenizerFast.from_pretrained(path_350m, padding_side="left") + tokenizer = BloomTokenizerFast.from_pretrained(path_560m, padding_side="left") input_sentence = ["I enjoy walking with my cute dog", "Hello my name is"] input_sentence_without_pad = "Hello my name is" diff --git a/tests/onnx/test_onnx_v2.py b/tests/onnx/test_onnx_v2.py index 5634abc7706856..79eff60cefed63 100644 --- a/tests/onnx/test_onnx_v2.py +++ b/tests/onnx/test_onnx_v2.py @@ -215,7 +215,7 @@ def test_values_override(self): } PYTORCH_EXPORT_WITH_PAST_MODELS = { - ("bloom", "bigscience/bloom-350m"), + ("bloom", "bigscience/bloom-560m"), ("gpt2", "gpt2"), ("gpt-neo", "EleutherAI/gpt-neo-125M"), } From b2fe78b0c8d5c961f334cfe3de957ffa469d1a30 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 12 Aug 2022 10:49:59 -0700 Subject: [PATCH 132/162] [doc] fix anchors (#18591) the manual anchors end up being duplicated with automatically added anchors and no longer work. --- docs/source/en/glossary.mdx | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/docs/source/en/glossary.mdx b/docs/source/en/glossary.mdx index b6cb2259d67da7..a61eb86eaab48d 100644 --- a/docs/source/en/glossary.mdx +++ b/docs/source/en/glossary.mdx @@ -44,7 +44,7 @@ specific language governing permissions and limitations under the License. 
Every model is different yet bears similarities with the others. Therefore most models use the same inputs, which are detailed here alongside usage examples. - + ### Input IDs @@ -113,7 +113,7 @@ we will see because this is the way a [`BertModel`] is going to expect its inputs. - + ### Attention mask @@ -171,7 +171,7 @@ in the dictionary returned by the tokenizer under the key "attention_mask": [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]] ``` - + ### Token Type IDs @@ -224,7 +224,7 @@ second sequence, corresponding to the "question", has all its tokens represented Some models, like [`XLNetModel`] use an additional token represented by a `2`. - + ### Position IDs @@ -238,7 +238,7 @@ absolute positional embeddings. Absolute positional embeddings are selected in the range `[0, config.max_position_embeddings - 1]`. Some models use other types of positional embeddings, such as sinusoidal position embeddings or relative position embeddings. - + ### Labels @@ -266,7 +266,7 @@ These labels are different according to the model head, for example: The base models (e.g., [`BertModel`]) do not accept labels, as these are the base transformer models, simply outputting features. - + ### Decoder input IDs @@ -279,7 +279,6 @@ such models, passing the `labels` is the preferred way to handle training. Please check each model's docs to see how they handle these input IDs for sequence to sequence training. - ### Feed Forward Chunking From c9d8c70d69be36f3cd64a7a66557148dec03620f Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 12 Aug 2022 10:50:52 -0700 Subject: [PATCH 133/162] [fsmt] deal with -100 indices in decoder ids (#18592) * [fsmt] deal with -100 indices in decoder ids Fixes: https://github.com/huggingface/transformers/issues/17945 decoder ids get the default index -100, which breaks the model - like t5 and many other models add a fix to replace -100 with the correct pad index. For some reason this use case hasn't been used with this model until recently - so this issue was there since the beginning it seems. Any suggestions to how to add a simple test here? or perhaps we have something similar already? user's script is quite massive. 
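A minimal sketch of the behavior (standalone; `pad_token_id=1` is an illustrative value, FSMT reads the real one from its config):

```python
import torch

pad_token_id = 1
# labels as produced by typical seq2seq collators: ignored positions are -100
input_ids = torch.tensor([[42, 17, 2, -100, -100]])

# the fix maps -100 back to the pad token before shifting tokens right,
# mirroring what t5 and many other seq2seq models already do
input_ids.masked_fill_(input_ids == -100, pad_token_id)
assert input_ids.tolist() == [[42, 17, 2, 1, 1]]
```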
* style --- src/transformers/models/fsmt/modeling_fsmt.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/transformers/models/fsmt/modeling_fsmt.py b/src/transformers/models/fsmt/modeling_fsmt.py index f469266d7454f6..8270a70e9ee927 100644 --- a/src/transformers/models/fsmt/modeling_fsmt.py +++ b/src/transformers/models/fsmt/modeling_fsmt.py @@ -372,6 +372,10 @@ def _check_shapes(shape_1, shape2): def shift_tokens_right(input_ids, pad_token_id): """Shift input ids one token to the right, and wrap the last non pad token (usually ).""" + + # replace possible -100 values in labels by `pad_token_id` + input_ids.masked_fill_(input_ids == -100, pad_token_id) + prev_output_tokens = input_ids.clone() index_of_eos = (input_ids.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1) prev_output_tokens[:, 0] = input_ids.gather(1, index_of_eos).squeeze() From af92441dab2bc34b51005dea7a461ae43a354b15 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Fri, 12 Aug 2022 20:04:38 +0200 Subject: [PATCH 134/162] small change (#18584) --- tests/trainer/test_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 9cdb02468b3034..f48265ffa58168 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -2398,7 +2398,7 @@ def test_bnb_adam8bit_no_bnb(self): # Pretend that bnb does not exist, even if installed. By setting bnb to None, importing # bnb will fail even if bnb is installed. - with patch.dict("sys.modules", {"bnb.optim": None}): + with patch.dict("sys.modules", {"bitsandbytes.optim": None}): with self.assertRaises(ValueError): Trainer.get_optimizer_cls_and_kwargs(args) From 2287492b680ed6ebcf301a2b07516e0b596bc45e Mon Sep 17 00:00:00 2001 From: Karim Foda <35491698+KMFODA@users.noreply.github.com> Date: Sun, 14 Aug 2022 17:27:13 +0200 Subject: [PATCH 135/162] Flax Remat for LongT5 (#17994) * [Flax] Add remat (gradient checkpointing) * fix variable naming in test * flip: checkpoint using a method * fix naming * fix class naming * apply PVP's suggestions from code review * add gradient_checkpointing to examples * Add gradient_checkpointing to run_mlm_flax * Add remat to longt5 * Add gradient checkpointing test longt5 * Fix args errors * Fix remaining tests * Make fixup & quality fixes * replace kwargs * remove unecessary kwargs * Make fixup changes * revert long_t5_flax changes * Remove return_dict and copy to LongT5 * Remove test_gradient_checkpointing Co-authored-by: sanchit-gandhi --- .../flax/language-modeling/run_mlm_flax.py | 9 ++ .../summarization/run_summarization_flax.py | 9 ++ .../models/longt5/modeling_flax_longt5.py | 75 +++++++++++---- .../models/t5/modeling_flax_t5.py | 95 +++++++++++++++---- 4 files changed, 149 insertions(+), 39 deletions(-) diff --git a/examples/flax/language-modeling/run_mlm_flax.py b/examples/flax/language-modeling/run_mlm_flax.py index 65f6a2285d9c34..408e09fc111cb3 100755 --- a/examples/flax/language-modeling/run_mlm_flax.py +++ b/examples/flax/language-modeling/run_mlm_flax.py @@ -107,6 +107,12 @@ class TrainingArguments: default=None, metadata={"help": "The name of the repository to keep in sync with the local `output_dir`."} ) hub_token: str = field(default=None, metadata={"help": "The token to use to push to the Model Hub."}) + gradient_checkpointing: bool = field( + default=False, + metadata={ + "help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass." 
+ }, + ) def __post_init__(self): if self.output_dir is not None: @@ -640,6 +646,9 @@ def group_texts(examples): dtype=getattr(jnp, model_args.dtype), ) + if training_args.gradient_checkpointing: + model.enable_gradient_checkpointing() + # Store some constant num_epochs = int(training_args.num_train_epochs) train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count() diff --git a/examples/flax/summarization/run_summarization_flax.py b/examples/flax/summarization/run_summarization_flax.py index c193fe0bc3745a..2813c88a3bd6fd 100644 --- a/examples/flax/summarization/run_summarization_flax.py +++ b/examples/flax/summarization/run_summarization_flax.py @@ -121,6 +121,12 @@ class TrainingArguments: default=None, metadata={"help": "The name of the repository to keep in sync with the local `output_dir`."} ) hub_token: str = field(default=None, metadata={"help": "The token to use to push to the Model Hub."}) + gradient_checkpointing: bool = field( + default=False, + metadata={ + "help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass." + }, + ) def __post_init__(self): if self.output_dir is not None: @@ -535,6 +541,9 @@ def main(): dtype=getattr(jnp, model_args.dtype), ) + if training_args.gradient_checkpointing: + model.enable_gradient_checkpointing() + if model.config.decoder_start_token_id is None: raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined") diff --git a/src/transformers/models/longt5/modeling_flax_longt5.py b/src/transformers/models/longt5/modeling_flax_longt5.py index 766dc36888e228..224515cd12a200 100644 --- a/src/transformers/models/longt5/modeling_flax_longt5.py +++ b/src/transformers/models/longt5/modeling_flax_longt5.py @@ -25,6 +25,7 @@ import jax.numpy as jnp from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen import combine_masks, make_causal_mask +from flax.linen import partitioning as nn_partitioning from flax.linen.attention import dot_product_attention_weights from flax.traverse_util import flatten_dict, unflatten_dict from jax.random import PRNGKey @@ -53,6 +54,8 @@ _CONFIG_FOR_DOC = "LongT5Config" _TOKENIZER_FOR_DOC = "T5Tokenizer" +remat = nn_partitioning.remat + # Copied from transformers.models.bart.modeling_flax_bart.shift_tokens_right def shift_tokens_right(input_ids: np.array, pad_token_id: int, decoder_start_token_id: int) -> np.ndarray: @@ -1356,7 +1359,6 @@ def __call__( encoder_attention_mask=None, encoder_decoder_position_bias=None, output_attentions=False, - return_dict=True, deterministic=True, init_cache=False, ): @@ -1377,13 +1379,31 @@ def __call__( class FlaxLongT5BlockCollection(nn.Module): config: LongT5Config dtype: jnp.dtype = jnp.float32 # the dtype of the computation + gradient_checkpointing: bool = False def setup(self): self.causal = self.config.causal - self.blocks = [ - FlaxLongT5LayerCollection(self.config, has_relative_attention_bias=(i == 0), dtype=self.dtype, name=str(i)) - for i in range(self.config.num_layers) - ] + if self.gradient_checkpointing: + FlaxLongT5CheckpointLayer = remat(FlaxLongT5LayerCollection, static_argnums=(6, 7, 8)) + self.blocks = [ + FlaxLongT5CheckpointLayer( + self.config, + has_relative_attention_bias=(i == 0), + dtype=self.dtype, + name=str(i), + ) + for i in range(self.config.num_layers) + ] + else: + self.blocks = [ + FlaxLongT5LayerCollection( + self.config, + has_relative_attention_bias=(i == 0), + dtype=self.dtype, + name=str(i), + ) + for i in range(self.config.num_layers) + ] 
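# A minimal, self-contained sketch of the remat pattern used in this file
# (ToyLayer and all names below are illustrative, assuming only jax and flax):
#
#     import jax, jax.numpy as jnp
#     import flax.linen as nn
#     from flax.linen import partitioning as nn_partitioning
#
#     class ToyLayer(nn.Module):
#         @nn.compact
#         def __call__(self, hidden_states, deterministic):
#             return nn.Dense(8)(hidden_states)
#
#     # static_argnums marks plain-Python arguments (here `deterministic`,
#     # index 1 after self) that must stay static under rematerialization,
#     # which is why the checkpointed blocks are called positionally.
#     CheckpointedToyLayer = nn_partitioning.remat(ToyLayer, static_argnums=(1,))
#     layer = CheckpointedToyLayer()
#     params = layer.init(jax.random.PRNGKey(0), jnp.ones((1, 4)), True)
#     out = layer.apply(params, jnp.ones((1, 4)), True)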
def __call__( self, @@ -1409,14 +1429,14 @@ def __call__( layer_outputs = layer_module( hidden_states, - attention_mask=attention_mask, - position_bias=position_bias, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - encoder_decoder_position_bias=encoder_decoder_position_bias, - output_attentions=output_attentions, - deterministic=deterministic, - init_cache=init_cache, + attention_mask, + position_bias, + encoder_hidden_states, + encoder_attention_mask, + encoder_decoder_position_bias, + output_attentions, + deterministic, + init_cache, ) hidden_states = layer_outputs[0] @@ -1447,11 +1467,14 @@ class FlaxLongT5Stack(nn.Module): config: LongT5Config embed_tokens: nn.Embed dtype: jnp.dtype = jnp.float32 # the dtype of the computation + gradient_checkpointing: bool = False def setup(self): self.causal = self.config.causal - self.block = FlaxLongT5BlockCollection(self.config, dtype=self.dtype) + self.block = FlaxLongT5BlockCollection( + self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing + ) self.final_layer_norm = FlaxLongT5LayerNorm( self.config.d_model, eps=self.config.layer_norm_epsilon, dtype=self.dtype ) @@ -1989,6 +2012,7 @@ def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, **kwargs class FlaxLongT5Module(nn.Module): config: LongT5Config dtype: jnp.dtype = jnp.float32 # the dtype of the computation + gradient_checkpointing: bool = False def _get_encoder_module(self): return self.encoder @@ -2005,12 +2029,22 @@ def setup(self): encoder_config = copy.deepcopy(self.config) encoder_config.causal = False - self.encoder = FlaxLongT5Stack(encoder_config, embed_tokens=self.shared, dtype=self.dtype) + self.encoder = FlaxLongT5Stack( + encoder_config, + embed_tokens=self.shared, + dtype=self.dtype, + gradient_checkpointing=self.gradient_checkpointing, + ) decoder_config = copy.deepcopy(self.config) decoder_config.causal = True decoder_config.num_layers = self.config.num_decoder_layers - self.decoder = FlaxLongT5Stack(decoder_config, embed_tokens=self.shared, dtype=self.dtype) + self.decoder = FlaxLongT5Stack( + decoder_config, + embed_tokens=self.shared, + dtype=self.dtype, + gradient_checkpointing=self.gradient_checkpointing, + ) def __call__( self, @@ -2104,6 +2138,7 @@ class FlaxLongT5Model(FlaxLongT5PreTrainedModel): class FlaxLongT5ForConditionalGenerationModule(nn.Module): config: LongT5Config dtype: jnp.dtype = jnp.float32 # the dtype of the computation + gradient_checkpointing: bool = False def _get_encoder_module(self): return self.encoder @@ -2124,13 +2159,17 @@ def setup(self): encoder_config.causal = False encoder_config.use_cache = False encoder_config.is_encoder_decoder = False - self.encoder = FlaxLongT5Stack(encoder_config, self.shared, dtype=self.dtype) + self.encoder = FlaxLongT5Stack( + encoder_config, self.shared, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing + ) decoder_config = copy.deepcopy(self.config) decoder_config.causal = True decoder_config.is_encoder_decoder = False decoder_config.num_layers = self.config.num_decoder_layers - self.decoder = FlaxLongT5Stack(decoder_config, self.shared, dtype=self.dtype) + self.decoder = FlaxLongT5Stack( + decoder_config, self.shared, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing + ) self.lm_head = nn.Dense( self.config.vocab_size, diff --git a/src/transformers/models/t5/modeling_flax_t5.py b/src/transformers/models/t5/modeling_flax_t5.py index 06ad5105429767..918a605fc4813a 100644 --- 
a/src/transformers/models/t5/modeling_flax_t5.py +++ b/src/transformers/models/t5/modeling_flax_t5.py @@ -25,6 +25,7 @@ import jax.numpy as jnp from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen import combine_masks, make_causal_mask +from flax.linen import partitioning as nn_partitioning from flax.linen.attention import dot_product_attention_weights from flax.traverse_util import flatten_dict, unflatten_dict from jax.random import PRNGKey @@ -53,6 +54,8 @@ _CONFIG_FOR_DOC = "T5Config" _TOKENIZER_FOR_DOC = "T5Tokenizer" +remat = nn_partitioning.remat + # Copied from transformers.models.bart.modeling_flax_bart.shift_tokens_right def shift_tokens_right(input_ids: np.array, pad_token_id: int, decoder_start_token_id: int) -> np.ndarray: @@ -622,7 +625,6 @@ def __call__( encoder_attention_mask=None, encoder_decoder_position_bias=None, output_attentions=False, - return_dict=True, deterministic=True, init_cache=False, ): @@ -642,13 +644,31 @@ def __call__( class FlaxT5BlockCollection(nn.Module): config: T5Config dtype: jnp.dtype = jnp.float32 # the dtype of the computation + gradient_checkpointing: bool = False def setup(self): self.causal = self.config.causal - self.blocks = [ - FlaxT5LayerCollection(self.config, has_relative_attention_bias=(i == 0), dtype=self.dtype, name=str(i)) - for i in range(self.config.num_layers) - ] + if self.gradient_checkpointing: + FlaxT5CheckpointLayer = remat(FlaxT5LayerCollection, static_argnums=(6, 7, 8)) + self.blocks = [ + FlaxT5CheckpointLayer( + self.config, + has_relative_attention_bias=(i == 0), + dtype=self.dtype, + name=str(i), + ) + for i in range(self.config.num_layers) + ] + else: + self.blocks = [ + FlaxT5LayerCollection( + self.config, + has_relative_attention_bias=(i == 0), + dtype=self.dtype, + name=str(i), + ) + for i in range(self.config.num_layers) + ] def __call__( self, @@ -674,14 +694,14 @@ def __call__( layer_outputs = layer_module( hidden_states, - attention_mask=attention_mask, - position_bias=position_bias, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - encoder_decoder_position_bias=encoder_decoder_position_bias, - output_attentions=output_attentions, - deterministic=deterministic, - init_cache=init_cache, + attention_mask, + position_bias, + encoder_hidden_states, + encoder_attention_mask, + encoder_decoder_position_bias, + output_attentions, + deterministic, + init_cache, ) hidden_states = layer_outputs[0] @@ -711,11 +731,14 @@ class FlaxT5Stack(nn.Module): config: T5Config embed_tokens: nn.Embed dtype: jnp.dtype = jnp.float32 # the dtype of the computation + gradient_checkpointing: bool = False def setup(self): self.causal = self.config.causal - self.block = FlaxT5BlockCollection(self.config, dtype=self.dtype) + self.block = FlaxT5BlockCollection( + self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing + ) self.final_layer_norm = FlaxT5LayerNorm( self.config.d_model, eps=self.config.layer_norm_epsilon, dtype=self.dtype ) @@ -919,11 +942,19 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, + gradient_checkpointing: bool = False, **kwargs ): - module = self.module_class(config=config, dtype=dtype, **kwargs) + module = self.module_class(config=config, dtype=dtype, gradient_checkpointing=gradient_checkpointing, **kwargs) super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) + def enable_gradient_checkpointing(self): + self._module = self.module_class( 
+ config=self.config, + dtype=self.dtype, + gradient_checkpointing=True, + ) + def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict: # init input tensors input_ids = jnp.zeros(input_shape, dtype="i4") @@ -1248,6 +1279,7 @@ def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, **kwargs class FlaxT5Module(nn.Module): config: T5Config dtype: jnp.dtype = jnp.float32 # the dtype of the computation + gradient_checkpointing: bool = False def _get_encoder_module(self): return self.encoder @@ -1264,12 +1296,22 @@ def setup(self): encoder_config = copy.deepcopy(self.config) encoder_config.causal = False - self.encoder = FlaxT5Stack(encoder_config, embed_tokens=self.shared, dtype=self.dtype) + self.encoder = FlaxT5Stack( + encoder_config, + embed_tokens=self.shared, + dtype=self.dtype, + gradient_checkpointing=self.gradient_checkpointing, + ) decoder_config = copy.deepcopy(self.config) decoder_config.causal = True decoder_config.num_layers = self.config.num_decoder_layers - self.decoder = FlaxT5Stack(decoder_config, embed_tokens=self.shared, dtype=self.dtype) + self.decoder = FlaxT5Stack( + decoder_config, + embed_tokens=self.shared, + dtype=self.dtype, + gradient_checkpointing=self.gradient_checkpointing, + ) def __call__( self, @@ -1364,6 +1406,7 @@ class FlaxT5Model(FlaxT5PreTrainedModel): class FlaxT5EncoderModule(nn.Module): config: T5Config dtype: jnp.dtype = jnp.float32 # the dtype of the computation + gradient_checkpointing: bool = False def setup(self): self.shared = nn.Embed( @@ -1376,7 +1419,12 @@ def setup(self): encoder_config.is_decoder = False encoder_config.is_encoder_decoder = False encoder_config.causal = False - self.encoder = FlaxT5Stack(encoder_config, embed_tokens=self.shared, dtype=self.dtype) + self.encoder = FlaxT5Stack( + encoder_config, + embed_tokens=self.shared, + dtype=self.dtype, + gradient_checkpointing=self.gradient_checkpointing, + ) def __call__( self, @@ -1384,7 +1432,7 @@ def __call__( attention_mask=None, output_attentions=False, output_hidden_states=False, - return_dict=True, + return_dict: bool = True, deterministic: bool = True, ): @@ -1445,6 +1493,7 @@ def __call__( class FlaxT5ForConditionalGenerationModule(nn.Module): config: T5Config dtype: jnp.dtype = jnp.float32 # the dtype of the computation + gradient_checkpointing: bool = False def _get_encoder_module(self): return self.encoder @@ -1465,13 +1514,17 @@ def setup(self): encoder_config.causal = False encoder_config.use_cache = False encoder_config.is_encoder_decoder = False - self.encoder = FlaxT5Stack(encoder_config, self.shared, dtype=self.dtype) + self.encoder = FlaxT5Stack( + encoder_config, self.shared, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing + ) decoder_config = copy.deepcopy(self.config) decoder_config.causal = True decoder_config.is_encoder_decoder = False decoder_config.num_layers = self.config.num_decoder_layers - self.decoder = FlaxT5Stack(decoder_config, self.shared, dtype=self.dtype) + self.decoder = FlaxT5Stack( + decoder_config, self.shared, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing + ) self.lm_head = nn.Dense( self.config.vocab_size, From c97b0856048b623c15cb2dfc2658bc4b9b819971 Mon Sep 17 00:00:00 2001 From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Date: Tue, 16 Aug 2022 16:34:51 +0530 Subject: [PATCH 136/162] mac m1 `mps` integration (#18598) * mac m1 `mps` integration * Update docs/source/en/main_classes/trainer.mdx Co-authored-by: 
Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * addressing comments * Apply suggestions from code review Co-authored-by: Dan Saattrup Nielsen <47701536+saattrupdan@users.noreply.github.com> * resolve comment Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Dan Saattrup Nielsen <47701536+saattrupdan@users.noreply.github.com> --- docs/source/en/main_classes/trainer.mdx | 60 +++++++++++++++++++++++++ src/transformers/training_args.py | 53 +++++++++++++++++----- 2 files changed, 103 insertions(+), 10 deletions(-) diff --git a/docs/source/en/main_classes/trainer.mdx b/docs/source/en/main_classes/trainer.mdx index 44c9d1d4b01973..ab942a2c1a7de6 100644 --- a/docs/source/en/main_classes/trainer.mdx +++ b/docs/source/en/main_classes/trainer.mdx @@ -591,6 +591,66 @@ More details in this [issue](https://github.com/pytorch/pytorch/issues/75676). More details mentioned in this [issue](https://github.com/pytorch/pytorch/issues/76501) (`The original model parameters' .grads are not set, meaning that they cannot be optimized separately (which is why we cannot support multiple parameter groups)`). +### Using Trainer for accelerated PyTorch Training on Mac + +With the PyTorch v1.12 release, developers and researchers can take advantage of Apple silicon GPUs for significantly faster model training. +This unlocks the ability to perform machine learning workflows like prototyping and fine-tuning locally, right on Mac. +Apple's Metal Performance Shaders (MPS) as a backend for PyTorch enables this and can be used via the new `"mps"` device. +This will map computational graphs and primitives onto the MPS Graph framework and tuned kernels provided by MPS. +For more information, please refer to the official documents [Introducing Accelerated PyTorch Training on Mac](https://pytorch.org/blog/introducing-accelerated-pytorch-training-on-mac/) +and [MPS BACKEND](https://pytorch.org/docs/stable/notes/mps.html). + +<Tip warning={true}> + +We strongly recommend installing PyTorch >= 1.13 (nightly version at the time of writing) on your MacOS machine. +It has major fixes related to model correctness and performance improvements for transformer based models. +Please refer to https://github.com/pytorch/pytorch/issues/82707 for more details. + +</Tip> + +**Benefits of Training and Inference using Apple Silicon Chips** + +1. Enables users to train larger networks or batch sizes locally +2. Reduces data retrieval latency and provides the GPU with direct access to the full memory store due to unified memory architecture. +Therefore, improving end-to-end performance. +3. Reduces costs associated with cloud-based development or the need for additional local GPUs. + +**Pre-requisites**: To install torch with mps support, +please follow this nice medium article [GPU-Acceleration Comes to PyTorch on M1 Macs](https://medium.com/towards-data-science/gpu-acceleration-comes-to-pytorch-on-m1-macs-195c399efcc1). + +**Usage**: +Users only need to pass the `--use_mps_device` argument.
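A quick sanity check that the backend is usable on a given machine (a minimal sketch relying only on the public `torch.backends.mps` helpers, the same ones `TrainingArguments` queries):

```python
import torch

# is_built(): this PyTorch binary ships with MPS support
# is_available(): the machine exposes an MPS device (Apple silicon, macOS 12.3+)
print("MPS built:", torch.backends.mps.is_built())
print("MPS available:", torch.backends.mps.is_available())
```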
+For example, you can run the official GLUE text classification task (from the root folder) on an Apple Silicon GPU with the command below: + +```bash +export TASK_NAME=mrpc + +python examples/pytorch/text-classification/run_glue.py \ + --model_name_or_path bert-base-cased \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size 32 \ + --learning_rate 2e-5 \ + --num_train_epochs 3 \ + --output_dir /tmp/$TASK_NAME/ \ + --use_mps_device \ + --overwrite_output_dir +``` + +**A few caveats to be aware of** + +1. Some PyTorch operations have not been implemented in mps and will throw an error. +One way to get around that is to set the environment variable `PYTORCH_ENABLE_MPS_FALLBACK=1`, +which will fall back to the CPU for these operations. It still throws a UserWarning, however. +2. The distributed backends `gloo` and `nccl` do not work with the `mps` device. +This means that currently only a single GPU of the `mps` device type can be used. + +Finally, please remember that 🤗 `Trainer` only integrates the MPS backend, so if you +have any problems or questions regarding MPS backend usage, please +file an issue with [PyTorch GitHub](https://github.com/pytorch/pytorch/issues). + Sections that were moved: [ DeepSpeed diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index e9a9f8f0043a79..7a23281d82ee21 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -22,6 +22,8 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Union +from packaging import version + from .debug_utils import DebugOption from .trainer_utils import ( EvaluationStrategy, @@ -478,6 +480,8 @@ class TrainingArguments: are also available. See the [Ray documentation]( https://docs.ray.io/en/latest/tune/api_docs/analysis.html#ray.tune.ExperimentAnalysis.get_best_trial) for more options. + use_mps_device (`bool`, *optional*, defaults to `False`): + Whether to use Apple Silicon chip based `mps` device. """ output_dir: str = field( @@ -630,6 +634,9 @@ class TrainingArguments: }, ) no_cuda: bool = field(default=False, metadata={"help": "Do not use CUDA even when it is available"}) + use_mps_device: bool = field( + default=False, metadata={"help": "Whether to use Apple Silicon chip based `mps` device."} + ) seed: int = field(default=42, metadata={"help": "Random seed that will be set at the beginning of training."}) data_seed: Optional[int] = field(default=None, metadata={"help": "Random seed to be used with data samplers."}) jit_mode_eval: bool = field( @@ -1368,16 +1375,42 @@ def _setup_devices(self) -> "torch.device": device = torch.device("cuda", self.local_rank) self._n_gpu = 1 elif self.local_rank == -1: - # if n_gpu is > 1 we'll use nn.DataParallel. - # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0` - # Explicitly set CUDA to the first (index 0) CUDA device, otherwise `set_device` will - # trigger an error that a device index is missing. Index 0 takes into account the - # GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0` - # will use the first GPU in that env, i.e. GPU#1 - device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - # Sometimes the line in the postinit has not been run before we end up here, so just checking we're not at - # the default value.
- self._n_gpu = torch.cuda.device_count() + if self.use_mps_device: + if not torch.backends.mps.is_available(): + if not torch.backends.mps.is_built(): + raise AssertionError( + "MPS not available because the current PyTorch install was not " + "built with MPS enabled. Please install torch version >=1.12.0 on " + "your Apple silicon Mac running macOS 12.3 or later with a native " + "version (arm64) of Python" + ) + else: + raise AssertionError( + "MPS not available because the current MacOS version is not 12.3+ " + "and/or you do not have an MPS-enabled device on this machine." + ) + else: + if not version.parse(version.parse(torch.__version__).base_version) > version.parse("1.12.0"): + warnings.warn( + "We strongly recommend to install PyTorch >= 1.13 (nightly version at the time of writing)" + " on your MacOS machine. It has major fixes related to model correctness and performance" + " improvements for transformer based models. Please refer to" + " https://github.com/pytorch/pytorch/issues/82707 for more details." + ) + device = torch.device("mps") + self._n_gpu = 1 + + else: + # if n_gpu is > 1 we'll use nn.DataParallel. + # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0` + # Explicitly set CUDA to the first (index 0) CUDA device, otherwise `set_device` will + # trigger an error that a device index is missing. Index 0 takes into account the + # GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0` + # will use the first GPU in that env, i.e. GPU#1 + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + # Sometimes the line in the postinit has not been run before we end up here, so just checking we're not at + # the default value. + self._n_gpu = torch.cuda.device_count() else: # Here, we'll use torch.distributed. # Initializes the distributed backend which will take care of synchronizing nodes/GPUs From ea2c99243bddbcad3b15d00657f987212a36a77b Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Tue, 16 Aug 2022 13:41:37 +0200 Subject: [PATCH 137/162] Change scheduled CIs to use torch 1.12.1 (#18644) Co-authored-by: ydshieh --- docker/transformers-all-latest-gpu/Dockerfile | 2 +- docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile | 2 +- docker/transformers-pytorch-gpu/Dockerfile | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile index b0a55ba8be946b..c1502651f64e38 100644 --- a/docker/transformers-all-latest-gpu/Dockerfile +++ b/docker/transformers-all-latest-gpu/Dockerfile @@ -9,7 +9,7 @@ SHELL ["sh", "-lc"] # The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant # to be used as arguments for docker build (so far). -ARG PYTORCH='1.12.0' +ARG PYTORCH='1.12.1' # (not always a valid torch version) ARG INTEL_TORCH_EXT='1.11.0' # Example: `cu102`, `cu113`, etc. diff --git a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile index 843e5e2df5172f..2b3292f350d71d 100644 --- a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile +++ b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile @@ -3,7 +3,7 @@ LABEL maintainer="Hugging Face" ARG DEBIAN_FRONTEND=noninteractive -ARG PYTORCH='1.12.0' +ARG PYTORCH='1.12.1' # Example: `cu102`, `cu113`, etc. 
ARG CUDA='cu113' diff --git a/docker/transformers-pytorch-gpu/Dockerfile b/docker/transformers-pytorch-gpu/Dockerfile index d7bb96e84ef69f..668bec3e715d86 100644 --- a/docker/transformers-pytorch-gpu/Dockerfile +++ b/docker/transformers-pytorch-gpu/Dockerfile @@ -12,7 +12,7 @@ RUN git clone https://github.com/huggingface/transformers && cd transformers && RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing] # If set to nothing, will install the latest version -ARG PYTORCH='1.12.0' +ARG PYTORCH='1.12.1' ARG TORCH_VISION='' ARG TORCH_AUDIO='' From 771d6c03626d33376cba720a194f68272c28ff33 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Tue, 16 Aug 2022 13:53:47 +0200 Subject: [PATCH 138/162] Add checks for some workflow jobs (#18583) Co-authored-by: ydshieh --- .github/workflows/self-push.yml | 34 +++++++++++++++-- utils/notification_service.py | 67 ++++++++++++++++++++++++--------- 2 files changed, 80 insertions(+), 21 deletions(-) diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml index bb397bc8574829..d0efae8b479844 100644 --- a/.github/workflows/self-push.yml +++ b/.github/workflows/self-push.yml @@ -111,9 +111,24 @@ jobs: echo "::set-output name=matrix::$keys" echo "::set-output name=test_map::$test_map" + run_check_runners: + name: Check Runners + needs: setup + strategy: + matrix: + machine_type: [single-gpu, multi-gpu] + runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}'] + container: + image: huggingface/transformers-all-latest-gpu + options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + - name: NVIDIA-SMI + run: | + nvidia-smi + run_tests_single_gpu: name: Model tests - needs: setup + needs: [setup, run_check_runners] # `dummy` means there is no test to run if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true strategy: @@ -198,7 +213,7 @@ jobs: run_tests_multi_gpu: name: Model tests - needs: setup + needs: [setup, run_check_runners] # `dummy` means there is no test to run if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true strategy: @@ -285,7 +300,7 @@ jobs: run_tests_torch_cuda_extensions_single_gpu: name: Torch CUDA extension tests - needs: setup + needs: [setup, run_check_runners] if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended') strategy: fail-fast: false @@ -364,7 +379,7 @@ jobs: run_tests_torch_cuda_extensions_multi_gpu: name: Torch CUDA extension tests - needs: setup + needs: [setup, run_check_runners] if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended') strategy: fail-fast: false @@ -447,12 +462,20 @@ jobs: if: always() needs: [ setup, + run_check_runners, run_tests_single_gpu, run_tests_multi_gpu, run_tests_torch_cuda_extensions_single_gpu, run_tests_torch_cuda_extensions_multi_gpu ] steps: + - name: Preliminary job status + shell: bash + # For the meaning of these environment variables, see the job `Setup` + run: | + echo "Setup status: ${{ needs.setup.result }}" + echo "Runner status: ${{ needs.run_check_runners.result }}" + # Necessary to get the correct branch name and commit SHA for `workflow_run` event # We also take into account the `push` event (we might want to test some changes in a branch) - name: Prepare custom environment variables @@ -498,6 +521,9 @@ jobs: CI_TITLE_PUSH: ${{ github.event.head_commit.message }} 
CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }} CI_SHA: ${{ env.CI_SHA }} + SETUP_STATUS: ${{ needs.setup.result }} + RUNNER_STATUS: ${{ needs.run_check_runners.result }} + # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`. run: | diff --git a/utils/notification_service.py b/utils/notification_service.py index 4918b4a459ac38..9ed97236d46270 100644 --- a/utils/notification_service.py +++ b/utils/notification_service.py @@ -387,28 +387,52 @@ def payload(self) -> str: return json.dumps(blocks) @staticmethod - def error_out(): - payload = [ - { - "type": "section", - "text": { - "type": "plain_text", - "text": "There was an issue running the tests.", - }, - "accessory": { - "type": "button", - "text": {"type": "plain_text", "text": "Check Action results", "emoji": True}, - "url": f"https://github.com/huggingface/transformers/actions/runs/{os.environ['GITHUB_RUN_ID']}", - }, - } - ] + def error_out(title, ci_title="", setup_failed=False, runner_failed=False): + + blocks = [] + title_block = {"type": "header", "text": {"type": "plain_text", "text": title}} + blocks.append(title_block) + + if ci_title: + ci_title_block = {"type": "section", "text": {"type": "mrkdwn", "text": ci_title}} + blocks.append(ci_title_block) + + if setup_failed: + text = "💔 Setup job failed. Tests are not run. 😭" + elif runner_failed: + text = "💔 CI runners have problems! Tests are not run. 😭" + else: + text = "💔 There was an issue running the tests. 😭" + + error_block_1 = { + "type": "header", + "text": { + "type": "plain_text", + "text": text, + }, + } + error_block_2 = { + "type": "section", + "text": { + "type": "plain_text", + "text": "🙏 Let's fix it ASAP! 
🙏", + }, + "accessory": { + "type": "button", + "text": {"type": "plain_text", "text": "Check Action results", "emoji": True}, + "url": f"https://github.com/huggingface/transformers/actions/runs/{os.environ['GITHUB_RUN_ID']}", + }, + } + blocks.extend([error_block_1, error_block_2]) + + payload = json.dumps(blocks) print("Sending the following payload") - print(json.dumps({"blocks": json.loads(payload)})) + print(json.dumps({"blocks": blocks})) client.chat_postMessage( channel=os.environ["CI_SLACK_REPORT_CHANNEL_ID"], - text="There was an issue running the tests.", + text=text, blocks=payload, ) @@ -630,6 +654,11 @@ def prepare_reports(title, header, reports, to_truncate=True): if __name__ == "__main__": + setup_status = os.environ.get("SETUP_STATUS") + runner_status = os.environ.get("RUNNER_STATUS") + setup_failed = True if setup_status is not None and setup_status != "success" else False + runner_failed = True if runner_status is not None and runner_status != "success" else False + org = "huggingface" repo = "transformers" repository_full_name = f"{org}/{repo}" @@ -689,6 +718,10 @@ def prepare_reports(title, header, reports, to_truncate=True): else: ci_title = "" + if setup_failed or runner_failed: + Message.error_out(title, ci_title, setup_failed, runner_failed) + exit(0) + arguments = sys.argv[1:][0] try: models = ast.literal_eval(arguments) From b53ef289d434626625577d6db5e0e7a449ff4a8c Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Tue, 16 Aug 2022 13:30:52 +0100 Subject: [PATCH 139/162] TF: Fix generation repetition penalty with XLA (#18648) --- src/transformers/generation_tf_logits_process.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/generation_tf_logits_process.py b/src/transformers/generation_tf_logits_process.py index 7b3f876212b867..f17ed04686860b 100644 --- a/src/transformers/generation_tf_logits_process.py +++ b/src/transformers/generation_tf_logits_process.py @@ -262,9 +262,11 @@ def _create_score_penalties(self, input_ids: tf.Tensor, logits: tf.Tensor) -> tf # Scatters the penalties token_penalties = tf.ones(logits.shape) + batch_size = input_ids.shape[0] + seq_len = tf.shape(input_ids)[1] # the sequence length has dynamic size, hence the dynamic shape indexable_prev_input_ids = tf.concat( ( - tf.expand_dims(tf.repeat(tf.range(input_ids.shape[0]), input_ids.shape[1]), axis=-1), + tf.expand_dims(tf.repeat(tf.range(batch_size), seq_len), axis=-1), tf.expand_dims(tf.reshape(input_ids, [-1]), axis=-1), ), axis=1, From 1769f662efacb7ea3fc0b1d3c7419993a230fc4c Mon Sep 17 00:00:00 2001 From: flozi00 Date: Tue, 16 Aug 2022 17:20:46 +0200 Subject: [PATCH 140/162] Update longt5.mdx (#18634) --- docs/source/en/model_doc/longt5.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/longt5.mdx b/docs/source/en/model_doc/longt5.mdx index 27a1d68515847c..0e73d6c8ddff0e 100644 --- a/docs/source/en/model_doc/longt5.mdx +++ b/docs/source/en/model_doc/longt5.mdx @@ -37,7 +37,7 @@ Tips: - [`LongT5ForConditionalGeneration`] is an extension of [`T5ForConditionalGeneration`] exchanging the traditional encoder *self-attention* layer with efficient either *local* attention or *transient-global* (*tglobal*) attention. - Unlike the T5 model, LongT5 does not use a task prefix. Furthermore, it uses a different pre-training objective -inspired by the pre-training of `[PegasusForConditionalGeneration]`. +inspired by the pre-training of [`PegasusForConditionalGeneration`]. 
- LongT5 model is designed to work efficiently and very well on long-range *sequence-to-sequence* tasks where the
input sequence exceeds commonly used 512 tokens. It is capable of handling input sequences of a length up to 16,384 tokens.
- For *Local Attention*, the sparse sliding-window local attention operation allows a given token to attend only `r`

From b2dc2f3739724c8920aa5a7ac4cd777276bb5761 Mon Sep 17 00:00:00 2001
From: zhoutang776 <47708118+zhoutang776@users.noreply.github.com>
Date: Tue, 16 Aug 2022 10:25:57 -0700
Subject: [PATCH 141/162] Update run_translation_no_trainer.py (#18637)

* Update run_translation_no_trainer.py

Found an error in selecting the `no_decay` parameters, and made some small modifications for when the user continues training from a checkpoint.

* fixes the `no_decay` and `resume_step` issues

1. change the `no_decay` list
2. if users continue training their model from a provided checkpoint, `resume_step` will not be initialized properly if `args.gradient_accumulation_steps != 1`
---
 examples/pytorch/language-modeling/run_clm_no_trainer.py   | 13 ++++++++++---
 examples/pytorch/language-modeling/run_mlm_no_trainer.py   | 11 +++++++++--
 examples/pytorch/translation/run_translation_no_trainer.py | 13 ++++++++++---
 3 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py
index 3fd67d5fbf66e4..225b88a49440cc 100755
--- a/examples/pytorch/language-modeling/run_clm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py
@@ -464,7 +464,7 @@ def group_texts(examples):
 
     # Optimizer
     # Split weights in two groups, one with weight decay and the other not.
-    no_decay = ["bias", "LayerNorm.weight"]
+    no_decay = ["bias", "layer_norm.weight"]
     optimizer_grouped_parameters = [
         {
             "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
@@ -558,10 +558,15 @@ def group_texts(examples):
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
         else:
-            resume_step = int(training_difference.replace("step_", ""))
+            # need to multiply `gradient_accumulation_steps` to reflect real steps
+            resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
 
+    # update the progress_bar if load from checkpoint
+    progress_bar.update(starting_epoch * num_update_steps_per_epoch)
+    completed_steps = starting_epoch * num_update_steps_per_epoch
+
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
         if args.with_tracking:
@@ -570,7 +575,9 @@ def group_texts(examples):
             # We need to skip steps until we reach the resumed step
             if args.resume_from_checkpoint and epoch == starting_epoch:
                 if resume_step is not None and step < resume_step:
-                    completed_steps += 1
+                    if step % args.gradient_accumulation_steps == 0:
+                        progress_bar.update(1)
+                        completed_steps += 1
                     continue
 
             with accelerator.accumulate(model):
diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
index 80dfcf9a9194e5..c5f6aad4126f5a 100755
--- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
@@ -602,10 +602,15 @@ def group_texts(examples):
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
         else:
-            resume_step = int(training_difference.replace("step_", ""))
+            # need 
to multiply `gradient_accumulation_steps` to reflect real steps + resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps starting_epoch = resume_step // len(train_dataloader) resume_step -= starting_epoch * len(train_dataloader) + # update the progress_bar if load from checkpoint + progress_bar.update(starting_epoch * num_update_steps_per_epoch) + completed_steps = starting_epoch * num_update_steps_per_epoch + for epoch in range(starting_epoch, args.num_train_epochs): model.train() if args.with_tracking: @@ -614,7 +619,9 @@ def group_texts(examples): # We need to skip steps until we reach the resumed step if args.resume_from_checkpoint and epoch == starting_epoch: if resume_step is not None and step < resume_step: - completed_steps += 1 + if step % args.gradient_accumulation_steps == 0: + progress_bar.update(1) + completed_steps += 1 continue with accelerator.accumulate(model): diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py index a6b0988f63d090..34c2ad1964090f 100644 --- a/examples/pytorch/translation/run_translation_no_trainer.py +++ b/examples/pytorch/translation/run_translation_no_trainer.py @@ -510,7 +510,7 @@ def preprocess_function(examples): # Optimizer # Split weights in two groups, one with weight decay and the other not. - no_decay = ["bias", "LayerNorm.weight"] + no_decay = ["bias", "LayerNorm.weight", "layer_norm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], @@ -607,10 +607,15 @@ def postprocess_text(preds, labels): starting_epoch = int(training_difference.replace("epoch_", "")) + 1 resume_step = None else: - resume_step = int(training_difference.replace("step_", "")) + # need to multiply `gradient_accumulation_steps` to reflect real steps + resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps starting_epoch = resume_step // len(train_dataloader) resume_step -= starting_epoch * len(train_dataloader) + # update the progress_bar if load from checkpoint + progress_bar.update(starting_epoch * num_update_steps_per_epoch) + completed_steps = starting_epoch * num_update_steps_per_epoch + for epoch in range(starting_epoch, args.num_train_epochs): model.train() if args.with_tracking: @@ -619,7 +624,9 @@ def postprocess_text(preds, labels): # We need to skip steps until we reach the resumed step if args.resume_from_checkpoint and epoch == starting_epoch: if resume_step is not None and step < resume_step: - completed_steps += 1 + if step % args.gradient_accumulation_steps == 0: + progress_bar.update(1) + completed_steps += 1 continue outputs = model(**batch) loss = outputs.loss From ab9d3b444305293870dabdfb4ff8a078449c2d6f Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Wed, 17 Aug 2022 00:48:10 +0200 Subject: [PATCH 142/162] [bnb] Minor modifications (#18631) * bnb minor modifications - refactor documentation - add troubleshooting README - add PyPi library on DockerFile * Apply suggestions from code review Co-authored-by: Stas Bekman * Apply suggestions from code review * Apply suggestions from code review * Apply suggestions from code review * put in one block - put bash instructions in one block * update readme - refactor a bit hardware requirements * change text a bit * Apply suggestions from code review Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com> * apply 
suggestions Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com> * add link to paper * Apply suggestions from code review Co-authored-by: Stas Bekman * Update tests/mixed_int8/README.md * Apply suggestions from code review * refactor a bit * add instructions Turing & Amperer Co-authored-by: Stas Bekman * add A6000 * clarify a bit * remove small part * Update tests/mixed_int8/README.md Co-authored-by: Stas Bekman Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com> --- docker/transformers-all-latest-gpu/Dockerfile | 2 +- docs/source/en/main_classes/model.mdx | 40 ------ docs/source/en/perf_train_gpu_one.mdx | 53 ++++++++ tests/mixed_int8/README.md | 117 +++++++++++++++--- 4 files changed, 154 insertions(+), 58 deletions(-) diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile index c1502651f64e38..4db6f51826f02b 100644 --- a/docker/transformers-all-latest-gpu/Dockerfile +++ b/docker/transformers-all-latest-gpu/Dockerfile @@ -46,7 +46,7 @@ RUN python3 -m pip install -U "itsdangerous<2.1.0" RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate # Add bitsandbytes for mixed int8 testing -RUN python3 -m pip install -i https://test.pypi.org/simple/ bitsandbytes==0.31.5 +RUN python3 -m pip install --no-cache-dir bitsandbytes RUN python3 -m pip install --no-cache-dir decord diff --git a/docs/source/en/main_classes/model.mdx b/docs/source/en/main_classes/model.mdx index 10f81e55d74506..fd19b3db52b734 100644 --- a/docs/source/en/main_classes/model.mdx +++ b/docs/source/en/main_classes/model.mdx @@ -133,46 +133,6 @@ model = AutoModel.from_config(config) Due to Pytorch design, this functionality is only available for floating dtypes. -### `bitsandbytes` integration for Int8 mixed-precision matrix decomposition - -From the paper `GPT3.int8() : 8-bit Matrix Multiplication for Transformers at Scale`, we suport HuggingFace 🤗 integration for all models in the Hub with few lines of code. -For models trained in half-precision (aka, either `float16` or `bfloat16`) or full precision. This method aims to reduce `nn.Linear` size by 2 (if trained in half precision) or by 4 if trained in full precision, without affecting too much quality by operating on the outliers in half-precision. -This technique is useful and works well for billion scale models (>1B parameters) therefore we advice you to use it only for models of that scale. This method has been tested for 2-billion to 176-billion scale models and supports only PyTorch models. - -![HFxbitsandbytes.png](https://s3.amazonaws.com/moonup/production/uploads/1659861207959-62441d1d9fdefb55a0b7d12c.png) - -Int8 mixed-precision matrix decomposition works by separating a matrix multiplication into two streams: (1) and systematic feature outlier stream matrix multiplied in fp16 (0.01%), (2) a regular stream of int8 matrix multiplication (99.9%). With this method, int8 inference with no predictive degradation is possible for very large models (>=176B parameters). -Values are usually normally distributed, that is, most values are in the range [-3.5, 3.5], but there are some exceptional systematic outliers that are very differently distributed for large models. These outliers are often in the interval [-60, -6] or [6, 60]. Int8 quantization works well for values of magnitude ~5, but beyond that, there is a significant performance penalty. 
A good default threshold is 6, but a lower threshold might be needed for more unstable models (small models, fine-tuning).
-
-Note also that you would require a GPU to run mixed-8bit models as the kernels has been compiled for GPUs only. Make sure that you have enough GPU RAM to store the quarter (or half if your model is natively in half precision) of the model before using this feature.
-
-Below are some notes to help you use this module, or follow this demo on Google colab: [![Open In Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1qOjXfQIAULfKvZqwCen8-MoWKGdSatZ4?usp=sharing)
-
-#### Requirements
-
-- Make sure you run that on a NVIDIA GPU that supports 8-bit tensor cores (Turing or Ampere GPUs - e.g. T4, RTX20s RTX30s, A40-A100). Note that previous generations of NVIDIA GPUs do not support 8-bit tensor cores.
-- Install the correct version of `bitsandbytes` by running:
-`pip install -i https://test.pypi.org/simple/ bitsandbytes`
-- Install `accelerate`:
-`pip install accelerate`
-
-#### Running mixed-int8 models
-
-After carefully installing the required libraries, the way to load your mixed 8-bit model is as follows:
-```py
-model_name = "bigscience/bloom-2b5"
-model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
-```
-The implementation supports multi-GPU setup thanks to `accelerate` as backend. If you want to control the GPU memory you want to allocate for each GPU, you can use the `max_memory` argument as follows:
-(If allocating `1GB` into GPU-0 and `2GB` into GPU-1, you can use `max_memory={0:"1GB", 1:"2GB"}`)
-```py
-max_memory_mapping = {0: "1GB", 1: "2GB"}
-model_name = "bigscience/bloom-3b"
-model_8bit = AutoModelForCausalLM.from_pretrained(
-    model_name, device_map="auto", load_in_8bit=True, max_memory=max_memory_mapping
-)
-```
-
 
 ## ModuleUtilsMixin
 
diff --git a/docs/source/en/perf_train_gpu_one.mdx b/docs/source/en/perf_train_gpu_one.mdx
index 56cd6c6f10e333..32748186a42fdf 100644
--- a/docs/source/en/perf_train_gpu_one.mdx
+++ b/docs/source/en/perf_train_gpu_one.mdx
@@ -733,3 +733,56 @@ This feature involves 3 different libraries. To install them, please follow the
 - [Torchdynamo installation](https://github.com/pytorch/torchdynamo#requirements-and-setup)
 - [Functorch installation](https://github.com/pytorch/functorch#install)
 - [Torch-TensorRT(FX) installation](https://github.com/pytorch/TensorRT/blob/master/docsrc/tutorials/getting_started_with_fx_path.rst#installation)
+
+## `bitsandbytes` integration for Int8 mixed-precision matrix decomposition
+
+From the paper [`LLM.int8() : 8-bit Matrix Multiplication for Transformers at Scale`](https://arxiv.org/abs/2208.07339), we support HuggingFace integration for all models in the Hub with a few lines of code.
+The method reduces `nn.Linear` size by 2 for `float16` and `bfloat16` weights and by 4 for `float32` weights, with close to no impact on quality, by operating on the outliers in half-precision.
+
+![HFxbitsandbytes.png](https://s3.amazonaws.com/moonup/production/uploads/1659861207959-62441d1d9fdefb55a0b7d12c.png)
+
+Int8 mixed-precision matrix decomposition works by separating a matrix multiplication into two streams: (1) a systematic feature outlier stream matrix multiplied in fp16 (0.01%), (2) a regular stream of int8 matrix multiplication (99.9%). With this method, int8 inference with no predictive degradation is possible for very large models. 
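+
+To make the decomposition concrete, here is a rough NumPy sketch of the idea (purely illustrative, with a function name of our own choosing - the actual `bitsandbytes` implementation relies on fused CUDA kernels and a more refined quantization scheme):
+
+```py
+import numpy as np
+
+
+def int8_matmul_decomposed(X, W, threshold=6.0):
+    # Columns of X whose magnitude crosses the threshold are treated as outlier features
+    outliers = np.abs(X).max(axis=0) >= threshold
+    regular = ~outliers
+
+    # Stream 1: the few outlier dimensions are multiplied in fp16
+    out_fp16 = X[:, outliers].astype(np.float16) @ W[outliers, :].astype(np.float16)
+
+    # Stream 2: everything else is quantized vector-wise to int8 with absmax scaling
+    sx = np.maximum(np.abs(X[:, regular]).max(axis=1, keepdims=True), 1e-8) / 127.0
+    sw = np.maximum(np.abs(W[regular, :]).max(axis=0, keepdims=True), 1e-8) / 127.0
+    Xq = np.round(X[:, regular] / sx).astype(np.int8)
+    Wq = np.round(W[regular, :] / sw).astype(np.int8)
+    out_int8 = (Xq.astype(np.int32) @ Wq.astype(np.int32)) * (sx * sw)
+
+    # Recombine the two streams
+    return out_fp16.astype(np.float32) + out_int8.astype(np.float32)
+```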
+For more details regarding the method, check out the [paper](https://arxiv.org/abs/2208.07339) or our [blog post about the integration](https://huggingface.co/blog/hf-bitsandbytes-integration).
+
+![MixedInt8.gif](https://s3.amazonaws.com/moonup/production/uploads/1660567469965-62441d1d9fdefb55a0b7d12c.gif)
+
+Note that you would require a GPU to run mixed-8bit models, as the kernels have been compiled for GPUs only. Make sure that you have enough GPU memory to store the quarter (or half if your model weights are in half precision) of the model before using this feature.
+Below are some notes to help you use this module, or follow the demos on [Google Colab](#colab-demos).
+
+### Requirements
+
+- Make sure you run this on NVIDIA GPUs that support 8-bit tensor cores (Turing, Ampere or newer architectures - e.g. T4, RTX20s, RTX30s, A40-A100).
+- Install the correct version of `bitsandbytes` by running:
+`pip install bitsandbytes>=0.31.5`
+- Install `accelerate` by running:
+`pip install accelerate>=0.12.0`
+
+### Running mixed-int8 models
+
+After installing the required libraries, the way to load your mixed 8-bit model is as follows:
+```py
+model_name = "bigscience/bloom-2b5"
+model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
+```
+The current implementation supports a multi-GPU setup when using `accelerate`. If you want to control the GPU memory to allocate for each GPU, use the `max_memory` argument as follows:
+
+```py
+max_memory_mapping = {0: "1GB", 1: "2GB"}
+model_name = "bigscience/bloom-3b"
+model_8bit = AutoModelForCausalLM.from_pretrained(
+    model_name, device_map="auto", load_in_8bit=True, max_memory=max_memory_mapping
+)
+```
+
+In this example, the first GPU will use 1GB of memory and the second 2GB.
+
+### Colab demos
+
+With this method you can run inference on models that previously could not be run on a Google Colab.
+Check out the demo for running T5-11b (42GB in fp32) with 8-bit quantization on Google Colab:
+
+[![Open In Colab: T5-11b demo](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1YORPWx4okIHXnjW7MSAidXN29mPVNT7F?usp=sharing)
+
+Or this demo for BLOOM-3B:
+
+[![Open In Colab: BLOOM-3b demo](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1qOjXfQIAULfKvZqwCen8-MoWKGdSatZ4?usp=sharing) \ No newline at end of file
diff --git a/tests/mixed_int8/README.md b/tests/mixed_int8/README.md
index c0173bed7a6b7a..7a0f86dbb25639 100644
--- a/tests/mixed_int8/README.md
+++ b/tests/mixed_int8/README.md
@@ -1,37 +1,120 @@
 # Testing mixed int8 quantization
 
+![HFxbitsandbytes.png](https://s3.amazonaws.com/moonup/production/uploads/1660567705337-62441d1d9fdefb55a0b7d12c.png)
+
+The following is a recipe for effectively debugging the `bitsandbytes` integration in Hugging Face `transformers`.
+
+## Library requirements
+
++ `transformers>=4.22.0`
++ `accelerate>=0.12.0`
++ `bitsandbytes>=0.31.5`
 
 ## Hardware requirements
 
-I am using a setup of 2 GPUs that are NVIDIA-Tesla T4 15GB
+The following instructions are tested with 2 NVIDIA-Tesla T4 GPUs. To run `bitsandbytes` successfully you need a GPU that supports 8-bit tensor cores. Note that Turing, Ampere or newer architectures - e.g. T4, RTX20s, RTX30s, A40-A100, A6000 - should be supported. 
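+
+If you are unsure whether your GPU qualifies, here is a minimal check (a sketch that assumes PyTorch with CUDA support is installed; Turing corresponds to compute capability 7.5):
+
+```py
+import torch
+
+# 8-bit tensor cores require compute capability 7.5 (Turing) or newer
+major, minor = torch.cuda.get_device_capability(0)
+print(f"Compute capability: {major}.{minor}")
+assert (major, minor) >= (7, 5), "This GPU predates 8-bit tensor core support"
+```
+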
 ## Virtual envs
 
-```conda create --name int8-testing python==3.8```
-```git clone https://github.com/younesbelkada/transformers.git && git checkout integration-8bit```
-```pip install -e ".[dev]"```
-```pip install -i https://test.pypi.org/simple/ bitsandbytes```
-```pip install git+https://github.com/huggingface/accelerate.git@e0212893ea6098cc0a7a3c7a6eb286a9104214c1```
+```bash
+conda create --name int8-testing python==3.8
+pip install bitsandbytes>=0.31.5
+pip install accelerate>=0.12.0
+pip install transformers>=4.23.0
+```
+If `transformers>=4.23.0` is not released yet, then use:
+```
+pip install git+https://github.com/huggingface/transformers.git
+```
+
+## Troubleshooting
+A list of common errors:
 
-## Trobleshooting
+### Torch does not correctly perform operations on the GPU
 
-```conda create --name int8-testing python==3.8```
-```pip install -i https://test.pypi.org/simple/ bitsandbytes```
-```conda install pytorch torchvision torchaudio -c pytorch```
-```git clone https://github.com/younesbelkada/transformers.git && git checkout integration-8bit```
-```pip install -e ".[dev]"```
-```pip install git+https://github.com/huggingface/accelerate.git@b52b793ea8bac108ba61192eead3cf11ca02433d```
+First check that:
 
-### Check driver settings:
+```py
+import torch
+vec = torch.randn(1, 2, 3).to(0)
 ```
-nvcc --version
+
+works without any error. If not, install torch using `conda` like:
+
+```bash
+conda create --name int8-testing python==3.8
+conda install pytorch torchvision torchaudio cudatoolkit=11.6 -c pytorch -c conda-forge
+pip install bitsandbytes>=0.31.5
+pip install accelerate>=0.12.0
+pip install transformers>=4.23.0
 ```
+For the latest PyTorch instructions, please see [this page](https://pytorch.org/get-started/locally/),
+
+and the snippet above should work.
+
+### ` bitsandbytes operations are not supported under CPU!`
+
+This happens when some Linear weights are set to the CPU when using `accelerate`. Please check `model.hf_device_map` carefully and make sure that no `Linear` module is assigned to the CPU. It is fine to have the last module (usually the `lm_head`) set on the CPU.
+
+### `To use the type as a Parameter, please correct the detach() semantics defined by __torch_dispatch__() implementation.`
+
+Use the latest version of `accelerate` with a command such as `pip install -U accelerate` and the problem should be solved.
+
+### `Parameter has no attribute .CB`
+
+Same solution as above.
+
+### `RuntimeError: CUDA error: an illegal memory access was encountered ... consider passing CUDA_LAUNCH_BLOCKING=1`
+
+Run your script by prepending `CUDA_LAUNCH_BLOCKING=1` and you should observe an error as described in the next section.
+
+### `CUDA illegal memory error: an illegal memory access at line...`
+Check the CUDA versions with:
+```
+nvcc --version
+```
+and confirm it is the same version as the one detected by `bitsandbytes`. If not, run:
 ```
 ls -l $CONDA_PREFIX/lib/libcudart.so
 ```
+or
+```
+ls -l $LD_LIBRARY_PATH
+```
+Check that `libcudart.so` has a correct symlink set. Sometimes `nvcc` detects the correct CUDA version but `bitsandbytes` doesn't. You have to make sure that the symlink for `libcudart.so` points to the correct CUDA file. 
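+
+You can inspect the file the symlink ultimately resolves to with the snippet below (a sketch - the exact library path depends on your setup):
+
+```bash
+# The version suffix of the resolved libcudart should match the version reported by nvcc
+readlink -f $CONDA_PREFIX/lib/libcudart.so
+```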
+
+Here is an example of a badly configured CUDA installation:
+
+`nvcc --version` gives:
+
+![Screenshot 2022-08-15 at 15.12.23.png](https://s3.amazonaws.com/moonup/production/uploads/1660569220888-62441d1d9fdefb55a0b7d12c.png)
+
+which means that the detected CUDA version is 11.3 but `bitsandbytes` outputs:
+
+![image.png](https://s3.amazonaws.com/moonup/production/uploads/1660569284243-62441d1d9fdefb55a0b7d12c.png)
+
+First check:
+
+```bash
+echo $LD_LIBRARY_PATH
+```
+
+If this contains multiple paths separated by `:`, then you have to make sure that the correct CUDA version is set, by running:
+
+```bash
+ls -l $path/libcudart.so
+```
+
+on each path (`$path`) in the list.
+If it contains a single path, simply run
+```bash
+ls -l $LD_LIBRARY_PATH/libcudart.so
+```
+
+and you can see
 
-### Recurrent bugs
+![Screenshot 2022-08-15 at 15.12.33.png](https://s3.amazonaws.com/moonup/production/uploads/1660569176504-62441d1d9fdefb55a0b7d12c.png)
 
-Sometimes you have to run a "dummy" inference pass when dealing with a multi-GPU setup. Checkout the ```test_multi_gpu_loading``` and the ```test_pipeline``` functions. \ No newline at end of file
+If you see that the file is linked to the wrong CUDA version (here 10.2), find the correct location for `libcudart.so` (`find / -name libcudart.so`) and replace the environment variable `LD_LIBRARY_PATH` with the one containing the correct `libcudart.so` file. \ No newline at end of file
From a316ea33f46c61705c9b2ddb5a2d500640052c8e Mon Sep 17 00:00:00 2001
From: Stefan Schweter <stefan@schweter.it>
Date: Wed, 17 Aug 2022 09:50:57 +0200
Subject: [PATCH 143/162] Examples: add Bloom support for token classification
 (#18632)

* examples: add Bloom support for token classification (FLAX, PyTorch and TensorFlow)

* examples: remove support for Bloom in token classification (FLAX and TensorFlow currently have no support for it)
---
 examples/pytorch/token-classification/run_ner.py            | 2 +-
 examples/pytorch/token-classification/run_ner_no_trainer.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py
index 9000b5006e03fa..a272f25aa417ea 100755
--- a/examples/pytorch/token-classification/run_ner.py
+++ b/examples/pytorch/token-classification/run_ner.py
@@ -348,7 +348,7 @@ def get_label_list(labels):
     )
 
     tokenizer_name_or_path = model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path
-    if config.model_type in {"gpt2", "roberta"}:
+    if config.model_type in {"bloom", "gpt2", "roberta"}:
         tokenizer = AutoTokenizer.from_pretrained(
             tokenizer_name_or_path,
             cache_dir=model_args.cache_dir,
diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py
index f5736f35c79148..937abb718e72b1 100755
--- a/examples/pytorch/token-classification/run_ner_no_trainer.py
+++ b/examples/pytorch/token-classification/run_ner_no_trainer.py
@@ -398,7 +398,7 @@ def get_label_list(labels):
             "You can do it from another script, save it, and load it from here, using --tokenizer_name." 
) - if config.model_type in {"gpt2", "roberta"}: + if config.model_type in {"bloom", "gpt2", "roberta"}: tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, use_fast=True, add_prefix_space=True) else: tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, use_fast=True) From c6751eacd6408ab47a1eb5e6d1264f5bcf4bfd19 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Wed, 17 Aug 2022 10:04:49 +0200 Subject: [PATCH 144/162] Fix Yolos ONNX export test (#18606) Co-authored-by: lewtun Co-authored-by: ydshieh --- tests/onnx/test_onnx_v2.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/onnx/test_onnx_v2.py b/tests/onnx/test_onnx_v2.py index 79eff60cefed63..b3c0ffb1f371b9 100644 --- a/tests/onnx/test_onnx_v2.py +++ b/tests/onnx/test_onnx_v2.py @@ -284,6 +284,12 @@ def _onnx_export(self, test_name, name, model_name, feature, onnx_config_class_c model_class = FeaturesManager.get_model_class_for_feature(feature) config = AutoConfig.from_pretrained(model_name) model = model_class.from_config(config) + + # Dynamic axes aren't supported for YOLO-like models. This means they cannot be exported to ONNX on CUDA devices. + # See: https://github.com/ultralytics/yolov5/pull/8378 + if model.__class__.__name__.startswith("Yolos") and device != "cpu": + return + onnx_config = onnx_config_class_constructor(model.config) if is_torch_available(): From b7046bc84d5d0ba9dca79dbd417507002bf0abb4 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Wed, 17 Aug 2022 14:32:11 +0100 Subject: [PATCH 145/162] Fixup --- src/transformers/image_transforms.py | 1 + src/transformers/image_utils.py | 11 ++++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index dd3d5bc240d40d..d265bd70966fd6 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -19,6 +19,7 @@ from transformers.utils.import_utils import is_tf_available, is_torch_available, is_vision_available + if is_vision_available(): import PIL diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index 4d291c7d489508..8131dfaebe9c59 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -14,7 +14,7 @@ # limitations under the License. 
import os -from typing import List, Tuple, Union +from typing import TYPE_CHECKING, List, Tuple, Union import numpy as np @@ -35,9 +35,14 @@ import PIL.ImageOps +if TYPE_CHECKING: + if is_torch_available(): + import torch + + ImageInput = Union[ - "PIL.Image.Image", np.ndarray, "torch.Tensor", List["PIL.Image.Image"], List[np.ndarray], List["torch.Tensor"] # noqa -] + "PIL.Image.Image", np.ndarray, "torch.Tensor", List["PIL.Image.Image"], List[np.ndarray], List["torch.Tensor"] +] # noqa class ChannelDimension(ExplicitEnum): From f8a6b87813d1dbe6ef80f1a4ca169235237d51a0 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Wed, 17 Aug 2022 14:48:21 +0100 Subject: [PATCH 146/162] Fix up --- src/transformers/image_transforms.py | 4 +++- src/transformers/models/glpn/feature_extraction_glpn.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index d265bd70966fd6..4a3197a2eba0d1 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -17,7 +17,7 @@ import numpy as np -from transformers.utils.import_utils import is_tf_available, is_torch_available, is_vision_available +from transformers.utils.import_utils import is_flax_available, is_tf_available, is_torch_available, is_vision_available if is_vision_available(): @@ -38,6 +38,8 @@ import torch if is_tf_available(): import tensorflow as tf + if is_flax_available(): + import jax.numpy as jnp def to_channel_dimension_format(image: np.ndarray, channel_dim: Union[ChannelDimension, str]) -> np.ndarray: diff --git a/src/transformers/models/glpn/feature_extraction_glpn.py b/src/transformers/models/glpn/feature_extraction_glpn.py index 9bcd4d1b9d6c6d..fe63276c4798a6 100644 --- a/src/transformers/models/glpn/feature_extraction_glpn.py +++ b/src/transformers/models/glpn/feature_extraction_glpn.py @@ -14,8 +14,8 @@ # limitations under the License. """Feature extractor class for GLPN.""" -from .image_processing_glpn import GLPNImageProcessor from ...utils import logging +from .image_processing_glpn import GLPNImageProcessor logger = logging.get_logger(__name__) From a37bce327d3da45c7cc251374b3d7161cd09df10 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Wed, 17 Aug 2022 15:32:38 +0100 Subject: [PATCH 147/162] Move PIL default arguments inside function for safe imports --- src/transformers/image_utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index 8131dfaebe9c59..cc4b60080f8056 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -315,7 +315,7 @@ def normalize(self, image, mean, std): else: return (image - mean) / std - def resize(self, image, size, resample=PIL.Image.BILINEAR, default_to_square=True, max_size=None): + def resize(self, image, size, resample=None, default_to_square=True, max_size=None): """ Resizes `image`. Enforces conversion of input to PIL.Image. @@ -345,6 +345,8 @@ def resize(self, image, size, resample=PIL.Image.BILINEAR, default_to_square=Tru Returns: image: A resized `PIL.Image.Image`. 
""" + resample = resample if resample is not None else PIL.Image.BILINEAR + self._ensure_format_supported(image) if not isinstance(image, PIL.Image.Image): @@ -472,7 +474,7 @@ def flip_channel_order(self, image): return image[::-1, :, :] - def rotate(self, image, angle, resample=PIL.Image.NEAREST, expand=0, center=None, translate=None, fillcolor=None): + def rotate(self, image, angle, resample=None, expand=0, center=None, translate=None, fillcolor=None): """ Returns a rotated copy of `image`. This method returns a copy of `image`, rotated the given number of degrees counter clockwise around its centre. @@ -485,6 +487,8 @@ def rotate(self, image, angle, resample=PIL.Image.NEAREST, expand=0, center=None Returns: image: A rotated `PIL.Image.Image`. """ + resample = resample if resample is not None else PIL.Image.NEAREST + self._ensure_format_supported(image) if not isinstance(image, PIL.Image.Image): From 6ec9dbb2fe72013bb970c3433431d90ec2ed47ed Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Wed, 17 Aug 2022 18:25:06 +0100 Subject: [PATCH 148/162] Add image utils to toctree --- docs/source/en/_toctree.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 78137d2c8a74c1..bac34ccf1c4b0e 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -475,6 +475,8 @@ title: Utilities for Trainer - local: internal/generation_utils title: Utilities for Generation + - local: internal/image_processing_utils + title: Utilities for Image Processors - local: internal/file_utils title: General Utilities title: Internal Helpers From 76936006af15b045968cfe583bfb7b40f5c55820 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Thu, 18 Aug 2022 12:30:26 +0100 Subject: [PATCH 149/162] Update `rescale` method to reflect changes in #18677 --- src/transformers/image_transforms.py | 4 ++-- src/transformers/models/glpn/image_processing_glpn.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 4a3197a2eba0d1..e58d08953bc69c 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -69,7 +69,7 @@ def to_channel_dimension_format(image: np.ndarray, channel_dim: Union[ChannelDim raise ValueError("Unsupported channel dimension format: {}".format(channel_dim)) -def rescale( +def rescale_image( image: np.ndarray, scale: Union[float, int] = 255, data_format: Optional[ChannelDimension] = None, dtype=np.float32 ) -> np.ndarray: """ @@ -127,7 +127,7 @@ def to_pil_image( # PIL.Image can only store uint8 values, so we rescale the image to be between 0 and 255 if needed. 
do_rescale = isinstance(image.flat[0], float) if do_rescale is None else do_rescale if do_rescale: - image = rescale(image, 255) + image = rescale_image(image, 255) image = image.astype(np.uint8) return PIL.Image.fromarray(image) diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py index 15605781d11a17..aeda45a2286642 100644 --- a/src/transformers/models/glpn/image_processing_glpn.py +++ b/src/transformers/models/glpn/image_processing_glpn.py @@ -22,7 +22,7 @@ from transformers.utils.generic import TensorType from ...image_processing_utils import BaseImageProcessor, BatchFeature -from ...image_transforms import rescale, resize, to_channel_dimension_format +from ...image_transforms import rescale_image, resize, to_channel_dimension_format from ...image_utils import ChannelDimension, get_image_size, is_batched, to_numpy_array, valid_images from ...utils import logging @@ -93,7 +93,7 @@ def resize( image = resize(image, (new_h, new_w), resample=resample, data_format=data_format, **kwargs) return image - def rescale( + def rescale_image( self, image: np.ndarray, scale: Union[int, float], data_format: Optional[ChannelDimension] = None, **kwargs ) -> np.ndarray: """ @@ -110,7 +110,7 @@ def rescale( - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `ChannelDimension.LAST`: image in (height, width, num_channels) format. """ - return rescale(image=image, scale=scale, data_format=data_format, **kwargs) + return rescale_image(image=image, scale=scale, data_format=data_format, **kwargs) def preprocess( self, @@ -172,7 +172,7 @@ def preprocess( images = [self.resize(image, size_divisor=size_divisor, resample=resample) for image in images] if do_rescale: - images = [self.rescale(image, scale=1 / 255) for image in images] + images = [self.rescale_image(image, scale=1 / 255) for image in images] images = [to_channel_dimension_format(image, data_format) for image in images] From 464a4f297deaecff838a3e6b35dc83c1a1debe1b Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Tue, 23 Aug 2022 18:17:04 +0100 Subject: [PATCH 150/162] Update docs/source/en/internal/image_processing_utils.mdx Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- docs/source/en/internal/image_processing_utils.mdx | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/source/en/internal/image_processing_utils.mdx b/docs/source/en/internal/image_processing_utils.mdx index 8bdf0ed11099b2..1ec890e9e1f786 100644 --- a/docs/source/en/internal/image_processing_utils.mdx +++ b/docs/source/en/internal/image_processing_utils.mdx @@ -25,8 +25,6 @@ Most of those are only useful if you are studying the code of the image processo [[autodoc]] image_transforms.to_pil_image - - ## ImageProcessorMixin [[autodoc]] image_processing_utils.ImageProcessorMixin From 713e9588cbc44b50f7ae09c88388c2c8dc160757 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Tue, 23 Aug 2022 20:52:14 +0100 Subject: [PATCH 151/162] Address Niels PR comments --- src/transformers/image_processing_utils.py | 3 +-- src/transformers/image_transforms.py | 27 ++++++++++++------- .../models/glpn/image_processing_glpn.py | 8 +++--- 3 files changed, 22 insertions(+), 16 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 721fc86f0ec52f..ba9d3c0962e3f6 100644 --- a/src/transformers/image_processing_utils.py +++ 
b/src/transformers/image_processing_utils.py @@ -31,8 +31,7 @@ class BatchFeature(BaseBatchFeature): Args: data (`dict`): - Dictionary of lists/arrays/tensors returned by the __call__/pad methods ('pixel_values', 'attention_mask', - etc.). + Dictionary of lists/arrays/tensors returned by the __call__ method ('pixel_values', etc.). tensor_type (`Union[None, str, TensorType]`, *optional*): You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at initialization. diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index e58d08953bc69c..5c88efb6601340 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -48,28 +48,32 @@ def to_channel_dimension_format(image: np.ndarray, channel_dim: Union[ChannelDim Args: image (`numpy.ndarray`): - The image to convert to the PIL Image format. + The image to have its channel dimension set. channel_dim (`ChannelDimension`): The channel dimension format to use. Returns: image: A converted np.ndarray. """ + if not isinstance(image, np.ndarray): + raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") + current_channel_dim = infer_channel_dimension_format(image) target_channel_dim = ChannelDimension(channel_dim) if current_channel_dim == target_channel_dim: return image if target_channel_dim == ChannelDimension.FIRST: - return image.transpose((2, 0, 1)) - - if target_channel_dim == ChannelDimension.LAST: - return image.transpose((1, 2, 0)) + image = image.transpose((2, 0, 1)) + elif target_channel_dim == ChannelDimension.LAST: + image = image.transpose((1, 2, 0)) + else: + raise ValueError("Unsupported channel dimension format: {}".format(channel_dim)) - raise ValueError("Unsupported channel dimension format: {}".format(channel_dim)) + return image -def rescale_image( +def rescale( image: np.ndarray, scale: Union[float, int] = 255, data_format: Optional[ChannelDimension] = None, dtype=np.float32 ) -> np.ndarray: """ @@ -89,6 +93,9 @@ def rescale_image( Returns: image: A rescaled np.ndarray image. """ + if not isinstance(image, np.ndarray): + raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") + rescaled_image = image * scale if data_format is not None: rescaled_image = to_channel_dimension_format(rescaled_image, data_format) @@ -113,12 +120,12 @@ def to_pil_image( if isinstance(image, PIL.Image.Image): return image + # Convert all tensors to numpy arrays before converting to PIL image if is_torch_tensor(image) or is_tf_tensor(image): image = image.numpy() elif is_jax_tensor(image): image = np.array(image) - - if not isinstance(image, np.ndarray): + elif not isinstance(image, np.ndarray): raise ValueError("Input image type not supported: {}".format(type(image))) # If the channel as been moved to first dim, we put it back at the end. @@ -127,7 +134,7 @@ def to_pil_image( # PIL.Image can only store uint8 values, so we rescale the image to be between 0 and 255 if needed. 
do_rescale = isinstance(image.flat[0], float) if do_rescale is None else do_rescale if do_rescale: - image = rescale_image(image, 255) + image = rescale(image, 255) image = image.astype(np.uint8) return PIL.Image.fromarray(image) diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py index aeda45a2286642..15605781d11a17 100644 --- a/src/transformers/models/glpn/image_processing_glpn.py +++ b/src/transformers/models/glpn/image_processing_glpn.py @@ -22,7 +22,7 @@ from transformers.utils.generic import TensorType from ...image_processing_utils import BaseImageProcessor, BatchFeature -from ...image_transforms import rescale_image, resize, to_channel_dimension_format +from ...image_transforms import rescale, resize, to_channel_dimension_format from ...image_utils import ChannelDimension, get_image_size, is_batched, to_numpy_array, valid_images from ...utils import logging @@ -93,7 +93,7 @@ def resize( image = resize(image, (new_h, new_w), resample=resample, data_format=data_format, **kwargs) return image - def rescale_image( + def rescale( self, image: np.ndarray, scale: Union[int, float], data_format: Optional[ChannelDimension] = None, **kwargs ) -> np.ndarray: """ @@ -110,7 +110,7 @@ def rescale_image( - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `ChannelDimension.LAST`: image in (height, width, num_channels) format. """ - return rescale_image(image=image, scale=scale, data_format=data_format, **kwargs) + return rescale(image=image, scale=scale, data_format=data_format, **kwargs) def preprocess( self, @@ -172,7 +172,7 @@ def preprocess( images = [self.resize(image, size_divisor=size_divisor, resample=resample) for image in images] if do_rescale: - images = [self.rescale_image(image, scale=1 / 255) for image in images] + images = [self.rescale(image, scale=1 / 255) for image in images] images = [to_channel_dimension_format(image, data_format) for image in images] From 4e60a7635a315f2e2e623a523220113b5bd0331f Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Wed, 24 Aug 2022 18:48:33 +0100 Subject: [PATCH 152/162] Add normalize method to transforms library --- src/transformers/image_transforms.py | 52 +++++++++++++++++++++++++++- src/transformers/image_utils.py | 19 ++++++++++ tests/test_image_transforms.py | 28 +++++++++++++++ tests/utils/test_image_utils.py | 25 ++++++++++++- 4 files changed, 122 insertions(+), 2 deletions(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 5c88efb6601340..2294e518dbaa1d 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import TYPE_CHECKING, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Iterable, List, Optional, Tuple, Union import numpy as np @@ -25,6 +25,7 @@ from .image_utils import ( ChannelDimension, + get_channel_dimension_axis, get_image_size, infer_channel_dimension_format, is_jax_tensor, @@ -251,3 +252,52 @@ def resize( resized_image = np.array(resized_image) resized_image = to_channel_dimension_format(resized_image, data_format) return resized_image + + +def normalize( + image, + mean: Union[float, Iterable[float]], + std: Union[float, Iterable[float]], + data_format: Optional[ChannelDimension] = None, +) -> np.ndarray: + """ + Normalizes `image` using the mean and standard deviation specified by `mean` and `std`. + + image = (image - mean) / std + + Args: + image (`np.ndarray`): + The image to normalize. + mean (`float` or `Iterable[float]`): + The mean to use for normalization. + std (`float` or `Iterable[float]`): + The standard deviation to use for normalization. + data_format (`ChannelDimension`, *optional*, defaults to `None`): + The channel dimension format of the output image. If `None`, will use the inferred format from the input. + """ + if not isinstance(image, np.ndarray): + raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") + + input_data_format = infer_channel_dimension_format(image) + channel_axis = get_channel_dimension_axis(image) + num_channels = image.shape[channel_axis] + + if isinstance(mean, Iterable): + if len(mean) != num_channels: + raise ValueError(f"mean must have {num_channels} elements if it is an iterable, got {len(mean)}") + else: + mean = [mean] * num_channels + + if isinstance(std, Iterable): + if len(std) != num_channels: + raise ValueError(f"std must have {num_channels} elements if it is an iterable, got {len(std)}") + else: + std = [std] * num_channels + + if input_data_format == ChannelDimension.LAST: + image = (image - mean) / std + else: + image = ((image.T - mean) / std).T + + image = to_channel_dimension_format(image, data_format) if data_format is not None else image + return image diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index cc4b60080f8056..3f9133f5cd084a 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -112,6 +112,25 @@ def infer_channel_dimension_format(image: np.ndarray) -> ChannelDimension: raise ValueError("Unable to infer channel dimension format") +def get_channel_dimension_axis(image: np.ndarray) -> int: + """ + Returns the channel dimension axis of the image. + + Args: + image (`np.ndarray`): + The image to get the channel dimension axis of. + + Returns: + The channel dimension axis of the image. + """ + channel_dim = infer_channel_dimension_format(image) + if channel_dim == ChannelDimension.FIRST: + return image.ndim - 3 + elif channel_dim == ChannelDimension.LAST: + return image.ndim - 1 + raise ValueError(f"Unsupported data format: {channel_dim}") + + def get_image_size(image: np.ndarray, channel_dim: ChannelDimension = None) -> Tuple[int, int]: """ Returns the (height, width) dimensions of the image. 
diff --git a/tests/test_image_transforms.py b/tests/test_image_transforms.py index 69e6de1587b8d6..5e2f26a00b5038 100644 --- a/tests/test_image_transforms.py +++ b/tests/test_image_transforms.py @@ -36,6 +36,7 @@ from transformers.image_transforms import ( get_resize_output_image_size, + normalize, resize, to_channel_dimension_format, to_pil_image, @@ -172,3 +173,30 @@ def test_resize(self): self.assertIsInstance(resized_image, PIL.Image.Image) # PIL size is in (width, height) order self.assertEqual(resized_image.size, (40, 30)) + + def test_normalize(self): + image = np.random.randint(0, 256, (224, 224, 3)) / 255 + + # Test that exception is raised if inputs are incorrect + # Not a numpy array image + with self.assertRaises(ValueError): + normalize(5, 5, 5) + + # Number of mean values != number of channels + with self.assertRaises(ValueError): + normalize(image, mean=(0.5, 0.6), std=1) + + # Number of std values != number of channels + with self.assertRaises(ValueError): + normalize(image, mean=1, std=(0.5, 0.6)) + + # Test result is correct - output data format is channels_first and normalization + # correctly computed + mean = (0.5, 0.6, 0.7) + std = (0.1, 0.2, 0.3) + expected_image = ((image - mean) / std).transpose((2, 0, 1)) + + normalized_image = normalize(image, mean=mean, std=std, data_format="channels_first") + self.assertIsInstance(normalized_image, np.ndarray) + self.assertEqual(normalized_image.shape, (3, 224, 224)) + self.assertTrue(np.allclose(normalized_image, expected_image)) diff --git a/tests/utils/test_image_utils.py b/tests/utils/test_image_utils.py index 0d9999d33aef6d..b2502b0349e798 100644 --- a/tests/utils/test_image_utils.py +++ b/tests/utils/test_image_utils.py @@ -20,7 +20,7 @@ import pytest from transformers import is_torch_available, is_vision_available -from transformers.image_utils import ChannelDimension +from transformers.image_utils import ChannelDimension, get_channel_dimension_axis from transformers.testing_utils import require_torch, require_vision @@ -535,3 +535,26 @@ def test_infer_channel_dimension(self): image = np.random.randint(0, 256, (1, 3, 4, 5)) inferred_dim = infer_channel_dimension_format(image) self.assertEqual(inferred_dim, ChannelDimension.FIRST) + + def test_get_channel_dimension_axis(self): + # Test we correctly identify the channel dimension + image = np.random.randint(0, 256, (3, 4, 5)) + inferred_axis = get_channel_dimension_axis(image) + self.assertEqual(inferred_axis, 0) + + image = np.random.randint(0, 256, (1, 4, 5)) + inferred_axis = get_channel_dimension_axis(image) + self.assertEqual(inferred_axis, 0) + + image = np.random.randint(0, 256, (4, 5, 3)) + inferred_axis = get_channel_dimension_axis(image) + self.assertEqual(inferred_axis, 2) + + image = np.random.randint(0, 256, (4, 5, 1)) + inferred_axis = get_channel_dimension_axis(image) + self.assertEqual(inferred_axis, 2) + + # We can take a batched array of images and find the dimension + image = np.random.randint(0, 256, (1, 3, 4, 5)) + inferred_axis = get_channel_dimension_axis(image) + self.assertEqual(inferred_axis, 1) From 6ec76ffbfbba768920f5ec9b36a0d306ee347919 Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 2 Sep 2022 11:45:54 +0100 Subject: [PATCH 153/162] Apply suggestions from code review - remove defaults to None Co-authored-by: Sylvain Gugger Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/image_transforms.py | 4 ++-- 
 src/transformers/models/glpn/image_processing_glpn.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py
index 5c88efb6601340..da26f8bfe3e05a 100644
--- a/src/transformers/image_transforms.py
+++ b/src/transformers/image_transforms.py
@@ -166,7 +166,7 @@ def get_resize_output_image_size(
             (`size`,`size`). If set to `False`, will replicate
             [`torchvision.transforms.Resize`](https://pytorch.org/vision/stable/transforms.html#torchvision.transforms.Resize)
             with support for resizing only the smallest edge and providing an optional `max_size`.
-        max_size (`int`, *optional*, defaults to `None`):
+        max_size (`int`, *optional*):
            The maximum allowed for the longer edge of the resized image: if the longer edge of the image is greater
            than `max_size` after being resized according to `size`, then the image is resized again so that the longer
            edge is equal to `max_size`. As a result, `size` might be overruled, i.e the smaller edge may be shorter
@@ -222,7 +222,7 @@ def resize(
            The size to use for resizing the image.
        resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`):
            The filter to user for resampling.
-        data_format (`ChannelDimension`, *optional*, defaults to `None`):
+        data_format (`ChannelDimension`, *optional*):
            The channel dimension format of the output image. If `None`, will use the inferred format from the input.
        return_numpy (`bool`, *optional*, defaults to `True`):
            Whether or not to return the resized image as a numpy array. If False a PIL.Image.Image object is returned.
diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py
index 15605781d11a17..0751cdfd6149cd 100644
--- a/src/transformers/models/glpn/image_processing_glpn.py
+++ b/src/transformers/models/glpn/image_processing_glpn.py
@@ -139,7 +139,7 @@ def preprocess(
            resample (`int`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the enum `PIL.Image.Resampling`,
                Only has an effect if `do_resize` is set to `True`.
-            return_tensors (`str`, *optional*, defaults to `None`):
+            return_tensors (`str`, *optional*):
                The type of tensors to return. Can be one of:
                    - `None`: Return a list of `np.ndarray`.
                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.

From 48a07a1c861648e1cfb9b4f297798698d3a7b399 Mon Sep 17 00:00:00 2001
From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com>
Date: Wed, 12 Oct 2022 15:15:52 +0100
Subject: [PATCH 154/162] Fix docstrings and revert to PIL.Image.XXX resampling

Use PIL.Image.XXX resampling values instead of the PIL.Image.Resampling.XXX
enum: the enum is only available in recent Pillow versions (>= 9.1.0), the
Pillow version is not yet pinned, and older versions, which are still
supported, only provide the (now-deprecated) module-level constants.
---
 src/transformers/image_transforms.py                  | 8 +++-----
 src/transformers/models/glpn/image_processing_glpn.py | 4 ++--
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py
index da26f8bfe3e05a..8580e12adaec60 100644
--- a/src/transformers/image_transforms.py
+++ b/src/transformers/image_transforms.py
@@ -111,9 +111,9 @@ def to_pil_image(
     needed.

     Args:
-        image (`PIL.Image.Image`, `numpy.ndarray`, `torch.Tensor`, `tf.Tensor`):
+        image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor` or `tf.Tensor`):
            The image to convert to the PIL Image format.
-        rescale (`bool`, *optional*):
+        do_rescale (`bool`, *optional*):
            Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will default
            to `True` if the image type is a floating type, `False` otherwise.
     """
@@ -159,8 +159,6 @@ def get_resize_output_image_size(
            If `size` is an int and `default_to_square` is `True`, then image will be resized to (size, size). If
            `size` is an int and `default_to_square` is `False`, then smaller edge of the image will be matched to
            this number. i.e, if height > width, then image will be rescaled to (size * height / width, size).
-        resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`):
-            The filter to user for resampling.
        default_to_square (`bool`, *optional*, defaults to `True`):
            How to convert `size` when it is a single int. If set to `True`, the `size` will be converted to a square
            (`size`,`size`). If set to `False`, will replicate
            [`torchvision.transforms.Resize`](https://pytorch.org/vision/stable/transforms.html#torchvision.transforms.Resize)
            with support for resizing only the smallest edge and providing an optional `max_size`.
@@ -208,7 +206,7 @@ def resize(
     image,
     size: Tuple[int, int],
-    resample=PIL.Image.Resampling.BILINEAR,
+    resample=PIL.Image.BILINEAR,
     data_format: Optional[ChannelDimension] = None,
     return_numpy: bool = True,
 ) -> np.ndarray:
diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py
index 0751cdfd6149cd..efad553733c8c2 100644
--- a/src/transformers/models/glpn/image_processing_glpn.py
+++ b/src/transformers/models/glpn/image_processing_glpn.py
@@ -44,14 +44,14 @@ class GLPNImageProcessor(BaseImageProcessor):
        size_divisor (`int`, *optional*, defaults to 32):
            Set the class default for the `size_divisor` parameter. When `do_resize` is `True`, images are resized so
            their height and width are rounded down to the closest multiple of `size_divisor`.
-        resample (`PIL.Image.Resampling`, *optional*, defaults to `PIL.Image.Resampling.BILINEAR`):
+        resample (`PIL.Image.Resampling`, *optional*, defaults to `PIL.Image.BILINEAR`):
            Set the class default for `resample`. Defines the resampling filter to use if resizing the image.
    """

    model_input_names = ["pixel_values"]

    def __init__(
-        self, do_resize=True, do_rescale=True, size_divisor=32, resample=PIL.Image.Resampling.BILINEAR, **kwargs
+        self, do_resize=True, do_rescale=True, size_divisor=32, resample=PIL.Image.BILINEAR, **kwargs
    ) -> None:
        self.do_resize = do_resize
        self.do_rescale = do_rescale

From 8785229045265aed6c2909ba15a4f7b4dbc887cc Mon Sep 17 00:00:00 2001
From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com>
Date: Wed, 12 Oct 2022 15:50:41 +0100
Subject: [PATCH 155/162] Some more docstrings and PIL.Image tidy up

---
 src/transformers/image_transforms.py          |  9 +++--
 .../models/glpn/image_processing_glpn.py      | 38 +++++++++----------
 2 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py
index 8580e12adaec60..92c7fca9b0f6e8 100644
--- a/src/transformers/image_transforms.py
+++ b/src/transformers/image_transforms.py
@@ -74,7 +74,7 @@ def to_channel_dimension_format(image: np.ndarray, channel_dim: Union[ChannelDim

 def rescale(
-    image: np.ndarray, scale: Union[float, int] = 255, data_format: Optional[ChannelDimension] = None, dtype=np.float32
+    image: np.ndarray, scale: float, data_format: Optional[ChannelDimension] = None, dtype=np.float32
 ) -> np.ndarray:
    """
    Rescales `image` by `scale`.

    Args:
        image (`np.ndarray`):
            The image to rescale.
-        scale (`float` or `int`, *optional*, defaults to 255):
+        scale (`float`):
            The scale to use for rescaling the image.
        data_format (`ChannelDimension`, *optional*):
            The channel dimension format of the image. If not provided, it will be the same as the input image.
@@ -104,7 +104,8 @@ def rescale(

 def to_pil_image(
-    image: Union[np.ndarray, PIL.Image.Image, "torch.Tensor", "tf.Tensor", "jnp.Tensor"], do_rescale=None
+    image: Union[np.ndarray, PIL.Image.Image, "torch.Tensor", "tf.Tensor", "jnp.Tensor"],
+    do_rescale: Optional[bool] = None,
 ) -> PIL.Image.Image:
    """
    Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if
@@ -143,7 +144,7 @@ def get_resize_output_image_size(
     input_image: np.ndarray,
     size: Union[int, Tuple[int, int], List[int], Tuple[int]],
     default_to_square: bool = True,
-    max_size: int = None,
+    max_size: Optional[int] = None,
 ) -> tuple:
    """
    Find the target (height, width) dimension of the output image after resizing given the input image and the desired
diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py
index efad553733c8c2..ffd5fadad987d8 100644
--- a/src/transformers/models/glpn/image_processing_glpn.py
+++ b/src/transformers/models/glpn/image_processing_glpn.py
@@ -44,14 +44,19 @@ class GLPNImageProcessor(BaseImageProcessor):
        size_divisor (`int`, *optional*, defaults to 32):
            Set the class default for the `size_divisor` parameter. When `do_resize` is `True`, images are resized so
            their height and width are rounded down to the closest multiple of `size_divisor`.
-        resample (`PIL.Image.Resampling`, *optional*, defaults to `PIL.Image.BILINEAR`):
+        resample (`PIL.Image` resampling filter, *optional*, defaults to `PIL.Image.BILINEAR`):
            Set the class default for `resample`. Defines the resampling filter to use if resizing the image.
    """

    model_input_names = ["pixel_values"]

    def __init__(
-        self, do_resize=True, do_rescale=True, size_divisor=32, resample=PIL.Image.BILINEAR, **kwargs
+        self,
+        do_resize: bool = True,
+        do_rescale: bool = True,
+        size_divisor: int = 32,
+        resample=PIL.Image.BILINEAR,
+        **kwargs
    ) -> None:
        self.do_resize = do_resize
        self.do_rescale = do_rescale
@@ -60,12 +65,7 @@ def __init__(
        super().__init__(**kwargs)

    def resize(
-        self,
-        image: np.ndarray,
-        size_divisor: int,
-        resample: PIL.Image.Resampling,
-        data_format: Optional[ChannelDimension] = None,
-        **kwargs
+        self, image: np.ndarray, size_divisor: int, resample, data_format: Optional[ChannelDimension] = None, **kwargs
    ) -> np.ndarray:
        """
        Resize the image, rounding the (height, width) dimensions down to the closest multiple of size_divisor.
@@ -78,8 +78,8 @@ def resize(
            size_divisor (`int`):
                The image is resized so its height and width are rounded down to the closest multiple of
                `size_divisor`.
-            resample (`PIL.Image.Resampling`):
-                Resampling filter to use when resizing the image.
+            resample:
+                PIL.Image resampling filter to use when resizing the image e.g. PIL.Image.BILINEAR.
            data_format (`ChannelDimension`, *optional*):
                The channel dimension format for the output image. If `None`, the channel dimension format of the input image is used.
                Can be one of:
                - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
        """
        height, width = get_image_size(image)
        # Rounds the height and width down to the closest multiple of size_divisor
@@ -94,7 +94,7 @@ def resize(
        return image

    def rescale(
-        self, image: np.ndarray, scale: Union[int, float], data_format: Optional[ChannelDimension] = None, **kwargs
+        self, image: np.ndarray, scale: float, data_format: Optional[ChannelDimension] = None, **kwargs
    ) -> np.ndarray:
        """
        Rescale the image by the given scaling factor `scale`.

        Args:
            image (`np.ndarray`):
                The image to rescale.
-            scale (`int` or `float`):
+            scale (`float`):
                The scaling factor to rescale pixel values by.
            data_format (`ChannelDimension`, *optional*):
                The channel dimension format for the output image. If `None`, the channel dimension format of the input
@@ -115,10 +115,10 @@ def preprocess(
    def preprocess(
        self,
        images: Union["PIL.Image.Image", TensorType, List["PIL.Image.Image"], List[TensorType]],
-        do_resize: bool = None,
-        do_rescale: bool = None,
-        size_divisor: int = None,
-        resample: PIL.Image.Resampling = None,
+        do_resize: Optional[bool] = None,
+        do_rescale: Optional[bool] = None,
+        size_divisor: Optional[int] = None,
+        resample=None,
        return_tensors: Optional[Union[TensorType, str]] = None,
        data_format: ChannelDimension = ChannelDimension.FIRST,
        **kwargs
@@ -136,9 +136,9 @@ def preprocess(
            size_divisor (`int`, *optional*, defaults to `self.size_divisor`):
                When `do_resize` is `True`, images are resized so their height and width are rounded down to the
                closest multiple of `size_divisor`.
-            resample (`int`, *optional*, defaults to `self.resample`):
-                Resampling filter to use if resizing the image. This can be one of the enum `PIL.Image.Resampling`,
-                Only has an effect if `do_resize` is set to `True`.
+            resample (`PIL.Image` resampling filter, *optional*, defaults to `self.resample`):
+                PIL.Image resampling filter to use if resizing the image e.g. PIL.Image.BILINEAR. Only has an effect if
+                `do_resize` is set to `True`.
            return_tensors (`str`, *optional*):
                The type of tensors to return. Can be one of:
                    - `None`: Return a list of `np.ndarray`.
                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.

From d44fe63653b87b5d7eca9449b0ed154c9e77b654 Mon Sep 17 00:00:00 2001
From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com>
Date: Wed, 12 Oct 2022 16:22:24 +0100
Subject: [PATCH 156/162] Reorganise arguments so flags sit next to the
 parameters they modify

---
 .../models/glpn/image_processing_glpn.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py
index ffd5fadad987d8..08f2e601b0a810 100644
--- a/src/transformers/models/glpn/image_processing_glpn.py
+++ b/src/transformers/models/glpn/image_processing_glpn.py
@@ -38,14 +38,14 @@ class GLPNImageProcessor(BaseImageProcessor):
        do_resize (`bool`, *optional*, defaults to `True`):
            Set the class default for the `do_resize` parameter. Controls whether to resize the image's (height,
            width) dimensions, rounding them down to the closest multiple of `size_divisor`.
-        do_rescale (`bool`, *optional*, defaults to `True`):
-            Set the class default for the `do_rescale` parameter. Controls whether or not to apply the scaling factor
-            (to make pixel values floats between 0. and 1.).
        size_divisor (`int`, *optional*, defaults to 32):
            Set the class default for the `size_divisor` parameter. When `do_resize` is `True`, images are resized so
            their height and width are rounded down to the closest multiple of `size_divisor`.
        resample (`PIL.Image` resampling filter, *optional*, defaults to `PIL.Image.BILINEAR`):
            Set the class default for `resample`.
            Defines the resampling filter to use if resizing the image.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Set the class default for the `do_rescale` parameter. Controls whether or not to apply the scaling factor
+            (to make pixel values floats between 0. and 1.).
    """

    model_input_names = ["pixel_values"]

@@ -53,9 +53,9 @@ class GLPNImageProcessor(BaseImageProcessor):
    def __init__(
        self,
        do_resize: bool = True,
-        do_rescale: bool = True,
        size_divisor: int = 32,
        resample=PIL.Image.BILINEAR,
+        do_rescale: bool = True,
        **kwargs
    ) -> None:
        self.do_resize = do_resize
@@ -116,9 +116,9 @@ def preprocess(
        self,
        images: Union["PIL.Image.Image", TensorType, List["PIL.Image.Image"], List[TensorType]],
        do_resize: Optional[bool] = None,
-        do_rescale: Optional[bool] = None,
        size_divisor: Optional[int] = None,
        resample=None,
+        do_rescale: Optional[bool] = None,
        return_tensors: Optional[Union[TensorType, str]] = None,
        data_format: ChannelDimension = ChannelDimension.FIRST,
        **kwargs
@@ -131,14 +131,14 @@ def preprocess(
                The image or images to preprocess.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the input such that the (height, width) dimensions are a multiple of `size_divisor`.
-            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
-                Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.).
            size_divisor (`int`, *optional*, defaults to `self.size_divisor`):
                When `do_resize` is `True`, images are resized so their height and width are rounded down to the
                closest multiple of `size_divisor`.
            resample (`PIL.Image` resampling filter, *optional*, defaults to `self.resample`):
                PIL.Image resampling filter to use if resizing the image e.g. PIL.Image.BILINEAR. Only has an effect if
                `do_resize` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.).
            return_tensors (`str`, *optional*):
                The type of tensors to return. Can be one of:
                    - `None`: Return a list of `np.ndarray`.

From 83330efb3bbdfe03ed8e769d82580455a655a827 Mon Sep 17 00:00:00 2001
From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com>
Date: Wed, 12 Oct 2022 17:12:20 +0100
Subject: [PATCH 157/162] Few last docstring fixes

---
 src/transformers/image_transforms.py     | 17 ++++++++++++-----
 .../models/glpn/image_processing_glpn.py | 12 +++++++++---
 2 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py
index 92c7fca9b0f6e8..024b46911a750a 100644
--- a/src/transformers/image_transforms.py
+++ b/src/transformers/image_transforms.py
@@ -53,7 +53,7 @@ def to_channel_dimension_format(image: np.ndarray, channel_dim: Union[ChannelDim
        The channel dimension format to use.

    Returns:
-        image: A converted np.ndarray.
+        `np.ndarray`: The image with the channel dimension set to `channel_dim`.
    """
    if not isinstance(image, np.ndarray):
        raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}")
@@ -91,7 +91,7 @@ def rescale(
        extractors.

    Returns:
-        image: A rescaled np.ndarray image.
+        `np.ndarray`: The rescaled image.
    """
    if not isinstance(image, np.ndarray):
        raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}")
@@ -113,10 +113,13 @@ def to_pil_image(
    Args:
        image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor` or `tf.Tensor`):
-            The image to convert to the PIL Image format.
+            The image to convert to the `PIL.Image` format.
        do_rescale (`bool`, *optional*):
            Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will default
            to `True` if the image type is a floating type, `False` otherwise.
+
+    Returns:
+        `PIL.Image.Image`: The converted image.
    """
    if isinstance(image, PIL.Image.Image):
        return image
@@ -170,6 +173,9 @@ def get_resize_output_image_size(
            than `max_size` after being resized according to `size`, then the image is resized again so that the longer
            edge is equal to `max_size`. As a result, `size` might be overruled, i.e the smaller edge may be shorter
            than `size`. Only used if `default_to_square` is `False`.
+
+    Returns:
+        `tuple`: The target (height, width) dimension of the output image after resizing.
    """
    if isinstance(size, (tuple, list)):
        if len(size) == 2:
@@ -224,10 +230,11 @@ def resize(
        data_format (`ChannelDimension`, *optional*):
            The channel dimension format of the output image. If `None`, will use the inferred format from the input.
        return_numpy (`bool`, *optional*, defaults to `True`):
-            Whether or not to return the resized image as a numpy array. If False a PIL.Image.Image object is returned.
+            Whether or not to return the resized image as a numpy array. If False a `PIL.Image.Image` object is
+            returned.

    Returns:
-        image: A resized np.ndarray.
+        `np.ndarray`: The resized image.
    """
    if not len(size) == 2:
        raise ValueError("size must have 2 elements")
diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py
index 08f2e601b0a810..98ae1d53f73d60 100644
--- a/src/transformers/models/glpn/image_processing_glpn.py
+++ b/src/transformers/models/glpn/image_processing_glpn.py
@@ -79,12 +79,15 @@ def resize(
                The image is resized so its height and width are rounded down to the closest multiple of
                `size_divisor`.
            resample:
-                PIL.Image resampling filter to use when resizing the image e.g. PIL.Image.BILINEAR.
+                `PIL.Image` resampling filter to use when resizing the image e.g. `PIL.Image.BILINEAR`.
            data_format (`ChannelDimension`, *optional*):
                The channel dimension format for the output image. If `None`, the channel dimension format of the input
                image is used. Can be one of:
                - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+
+        Returns:
+            `np.ndarray`: The resized image.
        """
        height, width = get_image_size(image)
        # Rounds the height and width down to the closest multiple of size_divisor
@@ -109,6 +112,9 @@ def rescale(
                image is used. Can be one of:
                - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+
+        Returns:
+            `np.ndarray`: The rescaled image.
        """
        return rescale(image=image, scale=scale, data_format=data_format, **kwargs)

@@ -135,8 +141,8 @@ def preprocess(
            resample (`PIL.Image` resampling filter, *optional*, defaults to `self.resample`):
-                PIL.Image resampling filter to use if resizing the image e.g. PIL.Image.BILINEAR. Only has an effect if
-                `do_resize` is set to `True`.
+                `PIL.Image` resampling filter to use if resizing the image e.g. `PIL.Image.BILINEAR`. Only has an
+                effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.).
            return_tensors (`str`, *optional*):

From 292f786843948fe0759a178853ca77e1baa69b50 Mon Sep 17 00:00:00 2001
From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com>
Date: Wed, 12 Oct 2022 19:29:44 +0100
Subject: [PATCH 158/162] Add normalize to docs

---
 docs/source/en/internal/image_processing_utils.mdx | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/source/en/internal/image_processing_utils.mdx b/docs/source/en/internal/image_processing_utils.mdx
index 1ec890e9e1f786..857d48f0fe6e98 100644
--- a/docs/source/en/internal/image_processing_utils.mdx
+++ b/docs/source/en/internal/image_processing_utils.mdx
@@ -19,6 +19,8 @@ Most of those are only useful if you are studying the code of the image processo

 ## Image Transformations

+[[autodoc]] image_transforms.normalize
+
 [[autodoc]] image_transforms.rescale

 [[autodoc]] image_transforms.resize

From 7ea393cc76b947ec02e38c731ce6c81dfd465578 Mon Sep 17 00:00:00 2001
From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com>
Date: Wed, 12 Oct 2022 19:36:17 +0100
Subject: [PATCH 159/162] Accept PIL.Image inputs with deprecation warning

---
 src/transformers/image_transforms.py | 12 +++++++++---
 tests/test_image_transforms.py       |  5 -----
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py
index ff56487025c083..48f2d4d15a1df7 100644
--- a/src/transformers/image_transforms.py
+++ b/src/transformers/image_transforms.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import warnings
 from typing import TYPE_CHECKING, Iterable, List, Optional, Tuple, Union

 import numpy as np
@@ -31,6 +32,7 @@
     is_jax_tensor,
     is_tf_tensor,
     is_torch_tensor,
+    to_numpy_array,
 )


@@ -261,7 +263,7 @@ def resize(

 def normalize(
-    image,
+    image: np.ndarray,
     mean: Union[float, Iterable[float]],
     std: Union[float, Iterable[float]],
     data_format: Optional[ChannelDimension] = None,
@@ -281,8 +283,12 @@ def normalize(
        data_format (`ChannelDimension`, *optional*, defaults to `None`):
            The channel dimension format of the output image. If `None`, will use the inferred format from the input.
    """
-    if not isinstance(image, np.ndarray):
-        raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}")
+    if isinstance(image, PIL.Image.Image):
+        warnings.warn("PIL will not be supported as input in the next release. Please use numpy arrays instead.")
+        # Convert PIL image to numpy array with the same logic as in the previous feature extractor normalize -
+        # casting to numpy array and dividing by 255.
+ image = to_numpy_array(image) + image = rescale(image, scale=1 / 255) input_data_format = infer_channel_dimension_format(image) channel_axis = get_channel_dimension_axis(image) diff --git a/tests/test_image_transforms.py b/tests/test_image_transforms.py index 5e2f26a00b5038..ee51bd358f40b2 100644 --- a/tests/test_image_transforms.py +++ b/tests/test_image_transforms.py @@ -177,11 +177,6 @@ def test_resize(self): def test_normalize(self): image = np.random.randint(0, 256, (224, 224, 3)) / 255 - # Test that exception is raised if inputs are incorrect - # Not a numpy array image - with self.assertRaises(ValueError): - normalize(5, 5, 5) - # Number of mean values != number of channels with self.assertRaises(ValueError): normalize(image, mean=(0.5, 0.6), std=1) From 0f996616395601b0e0ce3f59334726c2f4a2f838 Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Mon, 17 Oct 2022 13:26:06 +0100 Subject: [PATCH 160/162] Update src/transformers/image_transforms.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/image_transforms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index ff56487025c083..48f2d4d15a1df7 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -280,7 +280,7 @@ def normalize( The mean to use for normalization. std (`float` or `Iterable[float]`): The standard deviation to use for normalization. - data_format (`ChannelDimension`, *optional*, defaults to `None`): + data_format (`ChannelDimension`, *optional*): The channel dimension format of the output image. If `None`, will use the inferred format from the input. """ if isinstance(image, PIL.Image.Image): From 799829bcbe953232a50d66efee2a339fefb12ddc Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Mon, 17 Oct 2022 13:26:55 +0100 Subject: [PATCH 161/162] Update warning to include version --- src/transformers/image_transforms.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 48f2d4d15a1df7..04d8332be11ccc 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -284,7 +284,10 @@ def normalize( The channel dimension format of the output image. If `None`, will use the inferred format from the input. """ if isinstance(image, PIL.Image.Image): - warnings.warn("PIL will not be supported as input in the next release. Please use numpy arrays instead.") + warnings.warn( + "PIL.Image.Image inputs are deprecated and will be removed in v4.26.0. Please use numpy arrays instead.", + FutureWarning, + ) # Convert PIL image to numpy array with the same logic as in the previous feature extractor normalize - # casting to numpy array and dividing by 255. image = to_numpy_array(image) From d396382f7c90b69577210fbea15f0b8227e2b78d Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Mon, 17 Oct 2022 16:40:20 +0100 Subject: [PATCH 162/162] Trigger CI - hash clash on doc build
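Taken together, the GLPN patches in this series converge on a preprocessing pipeline with two optional steps: round the (height, width) dimensions down to the nearest multiple of `size_divisor` before resizing, then rescale pixel values to floats between 0. and 1. A minimal sketch of those two steps in isolation, assuming uint8 input; the `demo_*` helpers are illustrative names, not the library API:

import numpy as np


def demo_glpn_output_size(height: int, width: int, size_divisor: int = 32) -> tuple:
    # Round each dimension down to the closest multiple of size_divisor,
    # mirroring the rounding described in GLPNImageProcessor.resize.
    return (height // size_divisor) * size_divisor, (width // size_divisor) * size_divisor


def demo_rescale(image: np.ndarray, scale: float = 1 / 255) -> np.ndarray:
    # Rescale pixel values, e.g. uint8 in [0, 255] -> float32 in [0., 1.].
    return image.astype(np.float32) * scale


print(demo_glpn_output_size(480, 641))   # (480, 640)
image = np.random.randint(0, 256, (480, 641, 3), dtype=np.uint8)
print(demo_rescale(image).max() <= 1.0)  # True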
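Patches 159 and 161 also give `normalize` a transition path for `PIL.Image.Image` inputs: emit a `FutureWarning`, then fall back to the old feature-extractor coercion (cast to a numpy array, divide by 255). A standalone sketch of just that branch, assuming Pillow is installed; `demo_coerce_image` is an illustrative name, not the function added by the patch:

import warnings

import numpy as np
import PIL.Image


def demo_coerce_image(image):
    if isinstance(image, PIL.Image.Image):
        warnings.warn(
            "PIL.Image.Image inputs are deprecated and will be removed in v4.26.0. "
            "Please use numpy arrays instead.",
            FutureWarning,
        )
        # Same coercion as the previous feature extractors: cast to numpy, scale to [0, 1].
        image = np.array(image).astype(np.float32) * (1 / 255)
    return image


# Emits a FutureWarning and returns a float32 array with values in [0, 1]:
arr = demo_coerce_image(PIL.Image.new("RGB", (2, 2), color=(255, 0, 0)))
print(arr.shape, arr.max())  # (2, 2, 3) 1.0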