From ffa4bf8bbadd130465b95814ab20598c22506a92 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 15 Mar 2021 20:58:43 +0100 Subject: [PATCH 01/44] Fix rebase with master --- docs/source/model_doc/vit.rst | 99 ++ src/transformers/__init__.py | 19 + src/transformers/image_processor_utils.py | 761 ++++++++++++ src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 10 + src/transformers/models/auto/modeling_auto.py | 6 + src/transformers/models/vit/__init__.py | 67 + .../models/vit/configuration_vit.py | 147 +++ .../models/vit/convert_vit_timm_to_pytorch.py | 194 +++ .../models/vit/image_processor_vit.py | 250 ++++ src/transformers/models/vit/modeling_vit.py | 1102 +++++++++++++++++ tests/test_modeling_vit.py | 478 +++++++ 12 files changed, 3134 insertions(+) create mode 100644 docs/source/model_doc/vit.rst create mode 100644 src/transformers/image_processor_utils.py create mode 100644 src/transformers/models/vit/__init__.py create mode 100644 src/transformers/models/vit/configuration_vit.py create mode 100644 src/transformers/models/vit/convert_vit_timm_to_pytorch.py create mode 100644 src/transformers/models/vit/image_processor_vit.py create mode 100644 src/transformers/models/vit/modeling_vit.py create mode 100644 tests/test_modeling_vit.py diff --git a/docs/source/model_doc/vit.rst b/docs/source/model_doc/vit.rst new file mode 100644 index 00000000000000..b9082b9562ed79 --- /dev/null +++ b/docs/source/model_doc/vit.rst @@ -0,0 +1,99 @@ +.. + Copyright 2020 The HuggingFace Team. All rights reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + specific language governing permissions and limitations under the License. + +ViT +----------------------------------------------------------------------------------------------------------------------- + +Overview +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ViT model was proposed in ` +<>`__ by . + +The abstract from the paper is the following: + +** + +Tips: + + + +ViTConfig +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.ViTConfig + :members: + + +ViTTokenizer +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.ViTTokenizer + :members: build_inputs_with_special_tokens, get_special_tokens_mask, + create_token_type_ids_from_sequences, save_vocabulary + + +ViTTokenizerFast +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.ViTTokenizerFast + :members: build_inputs_with_special_tokens, get_special_tokens_mask, + create_token_type_ids_from_sequences, save_vocabulary + + +ViTModel +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
autoclass:: transformers.ViTModel + :members: forward + + +ViTForCausalLM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.ViTForCausalLM + :members: forward + + +ViTForMaskedLM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.ViTForMaskedLM + :members: forward + + +ViTForSequenceClassification +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.ViTForSequenceClassification + :members: forward + + +ViTForMultipleChoice +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.ViTForMultipleChoice + :members: forward + + +ViTForTokenClassification +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.ViTForTokenClassification + :members: forward + + +ViTForQuestionAnswering +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.ViTForQuestionAnswering + :members: forward \ No newline at end of file diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 57854cbefcb0cc..f132dff66198bc 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -126,6 +126,7 @@ ], "models": [], # Models + "models.vit": ["VIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTConfig", "ViTImageProcessor"], "models.wav2vec2": [ "WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP", "Wav2Vec2Config", @@ -397,6 +398,14 @@ "SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST", "Speech2TextForConditionalGeneration", "Speech2TextModel", + _import_structure["models.vit"].extend( + [ + "VIT_PRETRAINED_MODEL_ARCHIVE_LIST", + "ViTForImageClassification", + "ViTLayer", + "ViTModel", + "ViTPreTrainedModel", + "load_tf_weights_in_vit", ] ) @@ -1334,6 +1343,7 @@ load_tf2_weights_in_pytorch_model, ) from .models.albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig + from .models.vit import VIT_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTConfig, ViTImageProcessor from .models.auto import ( ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, CONFIG_MAPPING, @@ -1537,6 +1547,15 @@ # Modeling if is_torch_available(): + from .models.vit import ( + VIT_PRETRAINED_MODEL_ARCHIVE_LIST, + ViTForImageClassification, + ViTLayer, + ViTModel, + ViTPreTrainedModel, + load_tf_weights_in_vit, + ) + # Benchmarks from .benchmark.benchmark import PyTorchBenchmark from .benchmark.benchmark_args import PyTorchBenchmarkArguments diff --git a/src/transformers/image_processor_utils.py b/src/transformers/image_processor_utils.py new file mode 100644 index 00000000000000..8e454a1d30edc4 --- /dev/null +++ b/src/transformers/image_processor_utils.py @@ -0,0 +1,761 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +""" + Image processor common class for python image processors. +""" +import copy +import json +import os +from collections import UserDict +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union + +import numpy as np +import torch +from torch import Tensor + +from .file_utils import ( + PaddingStrategy, + TensorType, + _is_jax, + _is_numpy, + _is_tensorflow, + _is_torch, + _is_torch_device, + add_end_docstrings, + cached_path, + hf_bucket_url, + is_flax_available, + is_remote_url, + is_tf_available, + is_torch_available, + to_py_obj, + torch_required, +) +from .utils import logging + + +logger = logging.get_logger(__name__) + +if TYPE_CHECKING: + if is_torch_available(): + import torch + + +class BatchImages(UserDict): + r""" + Holds the output of the :meth:`~transformers.ImageProcessor.pad` and image processor specific ``__call__`` methods. + + This class is derived from a python dictionary and can be used as a dictionary. + + + Args: + data (:obj:`dict`): + Dictionary of lists/arrays/tensors returned by the __call__/pad methods ('pixel_values', 'pixel_mask', + etc.). + tensor_type (:obj:`Union[None, str, TensorType]`, `optional`): + You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at + initialization. + """ + + def __init__(self, data: Optional[Dict[str, Any]] = None, tensor_type: Union[None, str, TensorType] = None): + super().__init__(data) + self.convert_to_tensors(tensor_type=tensor_type) + + def __getitem__(self, item: str) -> Union[Any]: + """ + If the key is a string, returns the value of the dict associated to :obj:`key` ('pixel_values', + 'pixel_mask', etc.). + """ + if isinstance(item, str): + return self.data[item] + else: + raise KeyError("Indexing with integers is not available when using Python based image processors") + + def __getattr__(self, item: str): + try: + return self.data[item] + except KeyError: + raise AttributeError + + def __getstate__(self): + return {"data": self.data} + + def __setstate__(self, state): + if "data" in state: + self.data = state["data"] + + # Copied from transformers.tokenization_utils_base.BatchEncoding.keys + def keys(self): + return self.data.keys() + + # Copied from transformers.tokenization_utils_base.BatchEncoding.values + def values(self): + return self.data.values() + + # Copied from transformers.tokenization_utils_base.BatchEncoding.items + def items(self): + return self.data.items() + + def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None): + """ + Convert the inner content to tensors. + + + Args: + tensor_type (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`): + The type of tensors to use. If :obj:`str`, should be one of the values of the enum + :class:`~transformers.file_utils.TensorType`. If :obj:`None`, no modification is done. + """ + if tensor_type is None: + return self + + # Convert to TensorType + if not isinstance(tensor_type, TensorType): + tensor_type = TensorType(tensor_type) + + # Get a function reference for the correct framework + if tensor_type == TensorType.TENSORFLOW: + if not is_tf_available(): + raise ImportError( + "Unable to convert output to TensorFlow tensors format, TensorFlow is not installed." 
+ ) + import tensorflow as tf + + as_tensor = tf.constant + is_tensor = tf.is_tensor + elif tensor_type == TensorType.PYTORCH: + if not is_torch_available(): + raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.") + import torch + + as_tensor = torch.tensor + is_tensor = torch.is_tensor + elif tensor_type == TensorType.JAX: + if not is_flax_available(): + raise ImportError("Unable to convert output to JAX tensors format, JAX is not installed.") + import jax.numpy as jnp # noqa: F811 + + as_tensor = jnp.array + is_tensor = _is_jax + else: + as_tensor = np.asarray + is_tensor = _is_numpy + + # Do the tensor conversion in batch + for key, value in self.items(): + try: + if not is_tensor(value): + tensor = as_tensor(value) + + self[key] = tensor + except: # noqa E722 + if key == "overflowing_values": + raise ValueError("Unable to create tensor returning overflowing values of different lengths. ") + raise ValueError( + "Unable to create tensor, you should probably activate padding " + "with 'padding=True' to have batched tensors with the same resolution." + ) + + return self + + @torch_required + # Copied from transformers.tokenization_utils_base.BatchEncoding.to with BatchEncoding->BatchImages + def to(self, device: Union[str, "torch.device"]) -> "BatchImages": + """ + Send all values to device by calling :obj:`v.to(device)` (PyTorch only). + + + Args: + device (:obj:`str` or :obj:`torch.device`): The device to put the tensors on. + + + Returns: + :class:`~transformers.BatchImages`: The same instance of :class:`~transformers.BatchImages` after + modification. + """ + + # This check catches things like APEX blindly calling "to" on all inputs to a module + # Otherwise it passes the casts down and casts the LongTensor containing the token idxs + # into a HalfTensor + if isinstance(device, str) or _is_torch_device(device) or isinstance(device, int): + self.data = {k: v.to(device=device) for k, v in self.data.items()} + else: + logger.warning(f"Attempting to cast a BatchImages to type {str(device)}. This is not supported.") + return self + + +class PreTrainedImageProcessor: + """ + This is a general image processor class for vision-related tasks. + + + Args: + image_mean (:obj:`List[float]`): + The sequence of means for each channel, to be used when normalizing images. + image_std (:obj:`List[Float]`): + The sequence of standard deviations for each channel, to be used when normalizing images. + padding_value (:obj:`float`): + The value that is used to fill the padding pixels. + """ + + def __init__(self, image_mean: int, image_std: int, padding_value: float, **kwargs): + self.image_mean = image_mean + self.image_std = image_std + self.padding_value = padding_value + + self.return_pixel_mask = kwargs.pop("return_pixel_mask", True) + + # Additional attributes without default values + for key, value in kwargs.items(): + try: + setattr(self, key, value) + except AttributeError as err: + logger.error(f"Can't set {key} with value {value} for {self}") + raise err + + @classmethod + def from_pretrained( + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ) -> "PreTrainedImageProcessor": + r""" + Instantiate a :class:`~transformers.PreTrainedImageProcessor` (or a derived class) from a pretrained image + processor. + + + Args: + pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`): + This can be either: + + + - a string, the `model id` of a pretrained image_processor hosted inside a model repo on + huggingface.co. 
Valid model ids can be located at the root-level, like ``bert-base-uncased``, or + namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``. + - a path to a `directory` containing a image processor file saved using the + :func:`~transformers.PreTrainedImageProcessor.save_pretrained` method, e.g., + ``./my_model_directory/``. + - a path or url to a saved image processor JSON `file`, e.g., + ``./my_model_directory/feature_extraction_config.json``. + cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`): + Path to a directory in which a downloaded pretrained model image processor should be cached if the + standard cache should not be used. + force_download (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to force to (re-)download the image processor files and override the cached versions if + they exist. + resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to delete incompletely received file. Attempts to resume the download if such a file + exists. + proxies (:obj:`Dict[str, str]`, `optional`): + A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. + use_auth_token (:obj:`str` or `bool`, `optional`): + The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token + generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`). + revision(:obj:`str`, `optional`, defaults to :obj:`"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any + identifier allowed by git. + return_unused_kwargs (:obj:`bool`, `optional`, defaults to :obj:`False`): + If :obj:`False`, then this function returns just the final image processor object. + + If :obj:`True`, then this functions returns a :obj:`Tuple(image_processor, unused_kwargs)` where + `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not image processor + attributes: i.e., the part of ``kwargs`` which has not been used to update ``image_processor`` and is + otherwise ignored. + kwargs (:obj:`Dict[str, Any]`, `optional`): + The values in kwargs of any keys which are image processor attributes will be used to override the + loaded values. Behavior concerning key/value pairs whose keys are *not* image processor attributes is + controlled by the ``return_unused_kwargs`` keyword parameter. + + .. note:: + + Passing :obj:`use_auth_token=True` is required when you want to use a private model. + + + + Returns: + :class:`~transformers.PreTrainedImageProcessor`: The image processor object instantiated from this + pretrained model. + + + Examples:: + + # We can't instantiate directly the base class `PreTrainedImageProcessor` so let's show the examples on a + # derived class: DetrImageProcessor + image_processor = DetrImageProcessor.from_pretrained('facebook/detr-resnet-50') # Download image_processor_config from huggingface.co and cache. + image_processor = DetrImageProcessor.from_pretrained('./test/saved_model/') # E.g. 
image_processor (or model) was saved using `save_pretrained('./test/saved_model/')` + image_processor = DetrImageProcessor.from_pretrained('./test/saved_model/image_processor_config.json') + image_processor = DetrImageProcessor.from_pretrained('facebook/detr-resnet-50', return_pixel_mask=False, foo=False) + assert image_processor.return_pixel_mask is False + image_processor, unused_kwargs = DetrImageProcessor.from_pretrained('facebook/detr-resnet-50', return_pixel_mask=False, + foo=False, return_unused_kwargs=True) + assert image_processor.return_pixel_mask is False + assert unused_kwargs == {'foo': False} + + """ + image_processor_dict, kwargs = cls.get_image_processor_dict(pretrained_model_name_or_path, **kwargs) + + return cls.from_dict(image_processor_dict, **kwargs) + + def save_pretrained(self, save_directory: Union[str, os.PathLike]): + """ + Save a image_processor object to the directory ``save_directory``, so that it can be re-loaded using the + :func:`~transformers.PreTrainedImageProcessor.from_pretrained` class method. + + + Args: + save_directory (:obj:`str` or :obj:`os.PathLike`): + Directory where the image processor JSON file will be saved (will be created if it does not exist). + """ + if os.path.isfile(save_directory): + raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") + os.makedirs(save_directory, exist_ok=True) + # If we save using the predefined names, we can load using `from_pretrained` + output_image_processor_file = os.path.join(save_directory, FEATURE_EXTRACTOR_NAME) + + self.to_json_file(output_image_processor_file) + logger.info(f"Configuration saved in {output_image_processor_file}") + + @classmethod + def get_image_processor_dict( + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + """ + From a ``pretrained_model_name_or_path``, resolve to a dictionary of parameters, to be used for instantiating a + :class:`~transformers.PreTrainedImageProcessor` using ``from_dict``. + + + Parameters: + pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`): + The identifier of the pre-trained checkpoint from which we want the dictionary of parameters. + + + Returns: + :obj:`Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the image processor object. 
+ """ + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + use_auth_token = kwargs.pop("use_auth_token", None) + local_files_only = kwargs.pop("local_files_only", False) + revision = kwargs.pop("revision", None) + + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + if os.path.isdir(pretrained_model_name_or_path): + image_processor_file = os.path.join(pretrained_model_name_or_path, FEATURE_EXTRACTOR_NAME) + elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): + image_processor_file = pretrained_model_name_or_path + else: + image_processor_file = hf_bucket_url( + pretrained_model_name_or_path, filename=FEATURE_EXTRACTOR_NAME, revision=revision, mirror=None + ) + + try: + # Load from URL or cache if already cached + resolved_image_processor_file = cached_path( + image_processor_file, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + ) + # Load image_processor dict + with open(resolved_image_processor_file, "r", encoding="utf-8") as reader: + text = reader.read() + image_processor_dict = json.loads(text) + + except EnvironmentError as err: + logger.error(err) + msg = ( + f"Can't load image processor for '{pretrained_model_name_or_path}'. Make sure that:\n\n" + f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n" + f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a {FEATURE_EXTRACTOR_NAME} file\n\n" + ) + raise EnvironmentError(msg) + + except json.JSONDecodeError: + msg = ( + f"Couldn't reach server at '{image_processor_file}' to download image processor configuration file or " + "image processor configuration file is not a valid JSON file. " + f"Please check network or file content here: {resolved_image_processor_file}." + ) + raise EnvironmentError(msg) + + if resolved_image_processor_file == image_processor_file: + logger.info(f"loading image processor configuration file {image_processor_file}") + else: + logger.info( + f"loading image processor configuration file {image_processor_file} from cache at {resolved_image_processor_file}" + ) + + return image_processor_dict, kwargs + + @classmethod + def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs) -> "PreTrainedImageProcessor": + """ + Instantiates a :class:`~transformers.PreTrainedImageProcessor` from a Python dictionary of parameters. + + + Args: + image_processor_dict (:obj:`Dict[str, Any]`): + Dictionary that will be used to instantiate the image processor object. Such a dictionary can be + retrieved from a pretrained checkpoint by leveraging the + :func:`~transformers.PreTrainedImageProcessor.to_dict` method. + kwargs (:obj:`Dict[str, Any]`): + Additional parameters from which to initialize the image processor object. + + + Returns: + :class:`~transformers.PreTrainedImageProcessor`: The image processor object instantiated from those + parameters. 
+ """ + return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) + + image_processor = cls(**image_processor_dict) + + # Update image_processor with kwargs if needed + to_remove = [] + for key, value in kwargs.items(): + if hasattr(image_processor, key): + setattr(image_processor, key, value) + to_remove.append(key) + for key in to_remove: + kwargs.pop(key, None) + + logger.info(f"Image processor {image_processor}") + if return_unused_kwargs: + return image_processor, kwargs + else: + return image_processor + + def to_dict(self) -> Dict[str, Any]: + """ + Serializes this instance to a Python dictionary. + + + Returns: + :obj:`Dict[str, Any]`: Dictionary of all the attributes that make up this image processor instance. + """ + output = copy.deepcopy(self.__dict__) + + return output + + @classmethod + def from_json_file(cls, json_file: Union[str, os.PathLike]) -> "PreTrainedImageProcessor": + """ + Instantiates a :class:`~transformers.PreTrainedImageProcessor` from the path to a JSON file of parameters. + + + Args: + json_file (:obj:`str` or :obj:`os.PathLike`): + Path to the JSON file containing the parameters. + + + Returns: + :class:`~transformers.PreTrainedImageProcessor`: The image_processor object instantiated from that JSON + file. + + """ + with open(json_file, "r", encoding="utf-8") as reader: + text = reader.read() + image_processor_dict = json.loads(text) + return cls(**image_processor_dict) + + def to_json_string(self) -> str: + """ + Serializes this instance to a JSON string. + + + Returns: + :obj:`str`: String containing all the attributes that make up this image_processor instance in JSON format. + """ + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + + def to_json_file(self, json_file_path: Union[str, os.PathLike]): + """ + Save this instance to a JSON file. + + + Args: + json_file_path (:obj:`str` or :obj:`os.PathLike`): + Path to the JSON file in which this image_processor instance's parameters will be saved. + """ + with open(json_file_path, "w", encoding="utf-8") as writer: + writer.write(self.to_json_string()) + + def __repr__(self): + return f"{self.__class__.__name__} {self.to_json_string()}" + + def pad( + self, + processed_images: Union[ + BatchImages, + List[BatchImages], + Dict[str, BatchImages], + Dict[str, List[BatchImages]], + List[Dict[str, BatchImages]], + ], + padding: Union[bool, str, PaddingStrategy] = True, + max_resolution: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + return_pixel_mask: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + ) -> BatchImages: + """ + Pad input values or a batch of input values up to predefined resolution or to the max resolution in the batch. + + Padding values are defined at the image processor level (with ``self.padding_value``). + + .. note:: + + If the ``processed_images`` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, + the result will use the same type unless you provide a different tensor type with ``return_tensors``. In + the case of PyTorch tensors, you will lose the specific device of your tensors however. + + + Args: + processed_images (:class:`~transformers.BatchImages`, list of :class:`~transformers.BatchImages`, :obj:`Dict[str, List[float]]`, :obj:`Dict[str, List[List[float]]` or :obj:`List[Dict[str, List[float]]]`): + Processed inputs. 
Can represent one input (:class:`~transformers.BatchImages` or :obj:`Dict[str, + List[float]]`) or a batch of input values / vectors (list of :class:`~transformers.BatchImages`, + `Dict[str, List[List[float]]]` or `List[Dict[str, List[float]]]`) so you can use this method during + preprocessing as well as in a PyTorch Dataloader collate function. + + Instead of :obj:`List[float]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow + tensors), see the note above for the return type. + padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`): + Select a strategy to pad the returned sequences (according to the model's padding side and padding + index) among: + + + * :obj:`True` or :obj:`'biggest'`: Pad to the biggest image in the batch (or no padding if only a + single image if provided). + * :obj:`'max_resolution'`: Pad to a maximum resolution specified with the argument + :obj:`max_resolution` or to the maximum acceptable input resolution for the model if that argument is + not provided. + * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of + different resolutions). + max_resolution (:obj:`int`, `optional`): + Maximum resolution of the returned list and optionally padding length (see above). + pad_to_multiple_of (:obj:`int`, `optional`): + If set will pad the sequence to a multiple of the provided value. + + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability + >= 7.5 (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. + return_pixel_mask (:obj:`bool`, `optional`): + Whether to return the pixel mask. If left to the default, will return the pixel mask according + to the specific image_processor's default. + + `What are pixel masks? <../glossary.html#attention-mask>`__ + return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`): + If set, will return tensors instead of list of python integers. Acceptable values are: + + + * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. + * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects. + * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects. 
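+
+        Example (an illustrative sketch, not part of this patch; it assumes a concrete subclass such as
+        :class:`~transformers.ViTImageProcessor` bound to ``image_processor`` and a ``dataset`` whose items are
+        the dicts produced by its ``__call__`` method, used here as a DataLoader collate function)::
+
+            >>> from torch.utils.data import DataLoader
+
+            >>> def collate_fn(examples):
+            ...     return image_processor.pad(examples, padding=True, return_tensors="pt")
+
+            >>> dataloader = DataLoader(dataset, batch_size=2, collate_fn=collate_fn)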
+        """
+        # If we have a list of dicts, let's convert it into a dict of lists
+        # We do this to allow using this method as a collate_fn function in a PyTorch Dataloader
+        if isinstance(processed_images, (list, tuple)) and isinstance(processed_images[0], (dict, BatchImages)):
+            processed_images = {
+                key: [example[key] for example in processed_images] for key in processed_images[0].keys()
+            }
+
+        # The model's main input name, usually `pixel_values`, has to be passed for padding
+        if self.model_input_names[0] not in processed_images:
+            raise ValueError(
+                "You should supply an instance of :class:`~transformers.BatchImages` or a list of :class:`~transformers.BatchImages` to this method "
+                f"that includes {self.model_input_names[0]}, but you provided {list(processed_images.keys())}"
+            )
+
+        required_input = processed_images[self.model_input_names[0]]
+        return_pixel_mask = (
+            return_pixel_mask if return_pixel_mask is not None else self.return_pixel_mask
+        )
+
+        if not required_input:
+            if return_pixel_mask:
+                processed_images["pixel_mask"] = []
+            return processed_images
+
+        # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects
+        # and rebuild them afterwards if no return_tensors is specified
+        # Note that we lose the specific device the tensor may be on for PyTorch
+
+        first_element = required_input[0]
+        if isinstance(first_element, (list, tuple)):
+            # first_element might be an empty list/tuple in some edge cases so we grab the first non-empty element.
+            index = 0
+            while len(required_input[index]) == 0:
+                index += 1
+            if index < len(required_input):
+                first_element = required_input[index][0]
+        # At this stage, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
+        if not isinstance(first_element, (float, int, list, tuple)):
+            if is_tf_available() and _is_tensorflow(first_element):
+                return_tensors = "tf" if return_tensors is None else return_tensors
+            elif is_torch_available() and _is_torch(first_element):
+                return_tensors = "pt" if return_tensors is None else return_tensors
+            elif isinstance(first_element, np.ndarray):
+                return_tensors = "np" if return_tensors is None else return_tensors
+            else:
+                raise ValueError(
+                    f"type of {first_element} unknown: {type(first_element)}. "
+                    "Should be a python, numpy, pytorch or tensorflow object."
+                )
+
+            for key, value in processed_images.items():
+                processed_images[key] = to_py_obj(value)
+
+        # Convert the padding argument into a PaddingStrategy member
+        padding_strategy, max_resolution, _ = self._get_padding_strategies(
+            padding=padding, max_resolution=max_resolution
+        )
+
+        required_input = processed_images[self.model_input_names[0]]
+        if required_input and not isinstance(required_input[0], (list, tuple)):
+            processed_images = self._pad(
+                processed_images,
+                max_resolution=max_resolution,
+                padding_strategy=padding_strategy,
+                pad_to_multiple_of=pad_to_multiple_of,
+                return_pixel_mask=return_pixel_mask,
+            )
+            return BatchImages(processed_images, tensor_type=return_tensors)
+
+        batch_size = len(required_input)
+        assert all(
+            len(v) == batch_size for v in processed_images.values()
+        ), "Some items in the output dictionary have a different batch size than others."
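+        # `padding=True` (BIGGEST) is resolved here: measure the largest entry in the batch, then reuse the
+        # fixed `max_resolution` code path below by switching the strategy to MAX_RESOLUTION.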
+ + if padding_strategy == PaddingStrategy.BIGGEST: + max_resolution = max(len(inputs) for inputs in required_input) + padding_strategy = PaddingStrategy.MAX_RESOLUTION + + batch_outputs = {} + for i in range(batch_size): + inputs = dict((k, v[i]) for k, v in processed_images.items()) + outputs = self._pad( + inputs, + max_resolution=max_resolution, + padding_strategy=padding_strategy, + pad_to_multiple_of=pad_to_multiple_of, + return_pixel_mask=return_pixel_mask, + ) + + for key, value in outputs.items(): + if key not in batch_outputs: + batch_outputs[key] = [] + batch_outputs[key].append(value) + + return BatchImages(batch_outputs, tensor_type=return_tensors) + + def _pad( + self, + processed_images: Union[Dict[str, List[float]], BatchImages], + max_resolution: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_pixel_mask: Optional[bool] = None, + ) -> dict: + """ + Pad inputs (up to predefined resolution or max resolution in the batch) + + + Args: + processed_images: Dictionary of input values (`List[float]`) / input vectors (`List[List[float]]`) or batch of inputs values (`List[List[int]]`) / input vectors (`List[List[List[int]]]`) + max_resolution: maximum resolution of the returned list and optionally padding length (see below) + padding_strategy: PaddingStrategy to use for padding. + + + - PaddingStrategy.BIGGEST Pad to the biggest image in the batch (default) + - PaddingStrategy.MAX_RESOLUTION: Pad to the max resolution + - PaddingStrategy.DO_NOT_PAD: Do not pad + + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + >= 7.5 (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. 
+ return_pixel_mask: (optional) Set to False to avoid returning pixel mask (default: set to model specifics) + """ + required_input = processed_images[self.model_input_names[0]] + + if padding_strategy == PaddingStrategy.BIGGEST: + max_resolution = len(required_input) + + if ( + max_resolution is not None + and pad_to_multiple_of is not None + and (max_resolution % pad_to_multiple_of != 0) + ): + max_resolution = ((max_resolution // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_resolution + + if needs_to_be_padded: + difference = max_resolution - len(required_input) + padding_vector = self.feature_size * [self.padding_value] if self.feature_size > 1 else self.padding_value + # if self.padding_side == "right": + # if return_pixel_mask: + # processed_images["pixel_mask"] = [1] * len(required_input) + [0] * difference + # processed_images[self.model_input_names[0]] = required_input + [ + # padding_vector for _ in range(difference) + # ] + # elif self.padding_side == "left": + # if return_pixel_mask: + # processed_images["pixel_mask"] = [0] * difference + [1] * len(required_input) + # processed_images[self.model_input_names[0]] = [ + # padding_vector for _ in range(difference) + # ] + required_input + # else: + # raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + elif return_pixel_mask and "pixel_mask" not in processed_images: + processed_images["pixel_mask"] = [1] * len(required_input) + + return processed_images + + def _get_padding_strategies(self, padding=False, max_resolution=None, pad_to_multiple_of=None, **kwargs): + """ + Find the correct padding strategy + """ + + # Get padding strategy + if padding is not False: + if padding is True: + padding_strategy = PaddingStrategy.BIGGEST # Default to pad to the biggest image in the batch + elif not isinstance(padding, PaddingStrategy): + padding_strategy = PaddingStrategy(padding) + elif isinstance(padding, PaddingStrategy): + padding_strategy = padding + else: + padding_strategy = PaddingStrategy.DO_NOT_PAD + + # Set max resolution if needed + if max_resolution is None: + if padding_strategy == PaddingStrategy.MAX_RESOLUTION: + raise ValueError( + f"When setting ``padding={PaddingStrategy.MAX_RESOLUTION}``, make sure that" + f" max_resolution is defined" + ) + + # Test if we have a padding value + if padding_strategy != PaddingStrategy.DO_NOT_PAD and (self.padding_value is None): + raise ValueError( + "Asking to pad but the image_processor does not have a padding value. " + "Please select a value to use as `padding_value`. For example: `image_processor.padding_value = 0.0`." + ) + + return padding_strategy, max_resolution, kwargs diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index ca371d804ca389..9dd8101928f16e 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -17,6 +17,7 @@ # limitations under the License. from . 
import ( + vit, albert, auto, bart, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index c28d3190dce2ce..2a266f32c9f6dd 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -19,6 +19,7 @@ from ...configuration_utils import PretrainedConfig from ..albert.configuration_albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig +from ..vit.configuration_vit import VIT_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTConfig from ..bart.configuration_bart import BART_PRETRAINED_CONFIG_ARCHIVE_MAP, BartConfig from ..bert.configuration_bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BertConfig from ..bert_generation.configuration_bert_generation import BertGenerationConfig @@ -80,7 +81,11 @@ (key, value) for pretrained_map in [ # Add archive maps here +<<<<<<< HEAD SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, +======= + VIT_PRETRAINED_CONFIG_ARCHIVE_MAP, +>>>>>>> 8352309bd... First commit - copy from modeling_vit_pytorch WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP, M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP, CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -128,6 +133,7 @@ [ # Add configs here ("speech_to_text", Speech2TextConfig), + ("vit", ViTConfig), ("wav2vec2", Wav2Vec2Config), ("m2m_100", M2M100Config), ("convbert", ConvBertConfig), @@ -180,7 +186,11 @@ MODEL_NAMES_MAPPING = OrderedDict( [ # Add full (and cased) model names here +<<<<<<< HEAD ("speech_to_text", "Speech2Text"), +======= + ("vit", "ViT"), +>>>>>>> 8352309bd... First commit - copy from modeling_vit_pytorch ("wav2vec2", "Wav2Vec2"), ("m2m_100", "M2M100"), ("convbert", "ConvBERT"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index a78a974573744f..d81d652b1d417e 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -226,6 +226,10 @@ TapasModel, ) from ..transfo_xl.modeling_transfo_xl import TransfoXLForSequenceClassification, TransfoXLLMHeadModel, TransfoXLModel +from ..vit.modeling_vit import ( + ViTForImageClassification, + ViTModel, +) from ..wav2vec2.modeling_wav2vec2 import Wav2Vec2ForMaskedLM, Wav2Vec2Model from ..xlm.modeling_xlm import ( XLMForMultipleChoice, @@ -258,6 +262,7 @@ XLNetModel, ) from .configuration_auto import ( + ViTConfig, AlbertConfig, AutoConfig, BartConfig, @@ -316,6 +321,7 @@ [ # Base model mapping (Speech2TextConfig, Speech2TextModel), + (ViTConfig, ViTModel), (Wav2Vec2Config, Wav2Vec2Model), (M2M100Config, M2M100Model), (ConvBertConfig, ConvBertModel), diff --git a/src/transformers/models/vit/__init__.py b/src/transformers/models/vit/__init__.py new file mode 100644 index 00000000000000..3aaf327aebc929 --- /dev/null +++ b/src/transformers/models/vit/__init__.py @@ -0,0 +1,67 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING +from ...file_utils import _BaseLazyModule, is_torch_available, is_tokenizers_available +_import_structure = { + "configuration_vit": ["VIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTConfig"], + "image_processor_vit": ["ViTImageProcessor"], +} + +if is_torch_available(): + _import_structure["modeling_vit"] = [ + "VIT_PRETRAINED_MODEL_ARCHIVE_LIST", + "ViTForImageClassification", + "ViTLayer", + "ViTModel", + "ViTPreTrainedModel", + "load_tf_weights_in_vit", + ] + + +if TYPE_CHECKING: + from .configuration_vit import VIT_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTConfig + from .image_processor_vit import ViTImageProcessor + + if is_torch_available(): + from .modeling_vit import ( + VIT_PRETRAINED_MODEL_ARCHIVE_LIST, + ViTForImageClassification, + ViTLayer, + ViTModel, + ViTPreTrainedModel, + load_tf_weights_in_vit, + ) + + +else: + import importlib + import os + import sys + + class _LazyModule(_BaseLazyModule): + """ + Module class that surfaces all objects but only performs associated imports when the objects are requested. + """ + + __file__ = globals()["__file__"] + __path__ = [os.path.dirname(__file__)] + + def _get_module(self, module_name: str): + return importlib.import_module("." + module_name, self.__name__) + + sys.modules[__name__] = _LazyModule(__name__, _import_structure) diff --git a/src/transformers/models/vit/configuration_vit.py b/src/transformers/models/vit/configuration_vit.py new file mode 100644 index 00000000000000..40894f47d6196d --- /dev/null +++ b/src/transformers/models/vit/configuration_vit.py @@ -0,0 +1,147 @@ +# coding=utf-8 +# Copyright Google AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" ViT model configuration """ + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +VIT_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "vit-base-patch16-224": "https://huggingface.co/vit-base-patch16-224/resolve/main/config.json", + # See all ViT models at https://huggingface.co/models?filter=vit +} + + +class ViTConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a :class:`~transformers.ViTModel`. + It is used to instantiate an ViT model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of + the ViT `vit-base-patch16-224 `__ architecture. + + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used + to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` + for more information. + + + Args: + vocab_size (:obj:`int`, `optional`, defaults to 30522): + Vocabulary size of the ViT model. 
Defines the number of different tokens that can be represented by the + :obj:`inputs_ids` passed when calling :class:`~transformers.ViTModel` or + :class:`~transformers.TFViTModel`. + Vocabulary size of the model. Defines the different tokens that + can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.ViTModel`. + hidden_size (:obj:`int`, `optional`, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (:obj:`int`, `optional`, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (:obj:`int`, `optional`, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (:obj:`int`, `optional`, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. + If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported. + hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): + The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (:obj:`int`, `optional`, defaults to 512): + The maximum sequence length that this model might ever be used with. + Typically set this to something large just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (:obj:`int`, `optional`, defaults to 2): + The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.ViTModel` or + :class:`~transformers.TFViTModel`. + initializer_range (:obj:`float`, `optional`, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12): + The epsilon used by the layer normalization layers. + use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if ``config.is_decoder=True``. + gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`): + If True, use gradient checkpointing to save memory at the expense of slower backward pass. + img_size (:obj:`int`, `optional`, defaults to :obj:`224`): + The size (resolution) of each image. + patch_size (:obj:`int`, `optional`, defaults to :obj:`16`): + The size (resolution) of each patch. + in_chans (:obj:`int`, `optional`, defaults to :obj:`3`): + The number of input channels. 
+ + Example:: + + >>> from transformers import ViTModel, ViTConfig + + >>> # Initializing a ViT vit-base-patch16-224 style configuration + >>> configuration = ViTConfig() + + >>> # Initializing a model from the vit-base-patch16-224 style configuration + >>> model = ViTModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + """ + model_type = "vit" + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + use_cache=True, + is_encoder_decoder=False, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + img_size=224, + patch_size=16, + in_chans=3, + **kwargs + ): + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + **kwargs + ) + + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.type_vocab_size = type_vocab_size + self.layer_norm_eps = layer_norm_eps + self.use_cache = use_cache + + self.img_size = img_size + self.patch_size = patch_size + self.in_chans = in_chans + \ No newline at end of file diff --git a/src/transformers/models/vit/convert_vit_timm_to_pytorch.py b/src/transformers/models/vit/convert_vit_timm_to_pytorch.py new file mode 100644 index 00000000000000..296d553b3034fe --- /dev/null +++ b/src/transformers/models/vit/convert_vit_timm_to_pytorch.py @@ -0,0 +1,194 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert ViT checkpoints from the timm library.""" + + +import argparse +import os +from pathlib import Path + +import torch +from packaging import version +from torch import nn +import timm + +from PIL import Image +import requests +import torchvision.transforms as T + +from transformers import ( + ViTConfig, + ViTModel, + ViTForImageClassification, +) +from transformers.utils import logging + + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +# here we list all keys to be renamed (original name on the left, our name on the right) +def create_rename_keys(config, base_model=False): + rename_keys = [] + for i in range(config.num_hidden_layers): + # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms + rename_keys.append(("blocks." + str(i) + ".norm1.weight", "vit.encoder.layer." + str(i) + ".layernorm_before.weight")) + rename_keys.append(("blocks." 
+ str(i) + ".norm1.bias", "vit.encoder.layer." + str(i) + ".layernorm_before.bias")) + rename_keys.append(("blocks." + str(i) + ".attn.proj.weight", "vit.encoder.layer." + str(i) + ".attention.output.dense.weight")) + rename_keys.append(("blocks." + str(i) + ".attn.proj.bias", "vit.encoder.layer." + str(i) + ".attention.output.dense.bias")) + rename_keys.append(("blocks." + str(i) + ".norm2.weight", "vit.encoder.layer." + str(i) + ".layernorm_after.weight")) + rename_keys.append(("blocks." + str(i) + ".norm2.bias", "vit.encoder.layer." + str(i) + ".layernorm_after.bias")) + rename_keys.append(("blocks." + str(i) + ".mlp.fc1.weight", "vit.encoder.layer." + str(i) + ".intermediate.dense.weight")) + rename_keys.append(("blocks." + str(i) + ".mlp.fc1.bias", "vit.encoder.layer." + str(i) + ".intermediate.dense.bias")) + rename_keys.append(("blocks." + str(i) + ".mlp.fc2.weight", "vit.encoder.layer." + str(i) + ".output.dense.weight")) + rename_keys.append(("blocks." + str(i) + ".mlp.fc2.bias", "vit.encoder.layer." + str(i) + ".output.dense.bias")) + + # projection layer + position embeddings + rename_keys.extend([("cls_token", "vit.embeddings.cls_token"), + ("patch_embed.proj.weight", "vit.embeddings.patch_embeddings.projection.weight"), + ("patch_embed.proj.bias", "vit.embeddings.patch_embeddings.projection.bias"), + ("pos_embed", "vit.embeddings.position_embeddings"), + ]) + + # classification head + rename_keys.extend([("head.weight", "classifier.weight"), + ("head.bias", "classifier.bias"), + ("norm.weight", "layernorm.weight"), + ("norm.bias", "layernorm.bias"), + ]) + + # to do: add base model support + # if just the base model, we should remove "vit" from all keys + if base_model: + pass + + return rename_keys + + +# we split up the matrix of each encoder layer into queries, keys and values +def read_in_q_k_v(state_dict, config, base_model=False): + for i in range(config.num_hidden_layers): + # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) + in_proj_weight = state_dict.pop("blocks." + str(i) + ".attn.qkv.weight") + in_proj_bias = state_dict.pop("blocks." + str(i) + ".attn.qkv.bias") + # next, add query, keys and values (in that order) to the state dict + state_dict["vit.encoder.layer." + str(i) + ".attention.self.query.weight"] = in_proj_weight[:config.hidden_size, :] + state_dict["vit.encoder.layer." + str(i) + ".attention.self.query.bias"] = in_proj_bias[:config.hidden_size] + state_dict["vit.encoder.layer." + str(i) + ".attention.self.key.weight"] = in_proj_weight[config.hidden_size:config.hidden_size*2, :] + state_dict["vit.encoder.layer." + str(i) + ".attention.self.key.bias"] = in_proj_bias[config.hidden_size:config.hidden_size*2] + state_dict["vit.encoder.layer." + str(i) + ".attention.self.value.weight"] = in_proj_weight[-config.hidden_size:, :] + state_dict["vit.encoder.layer." 
+ str(i) + ".attention.self.value.bias"] = in_proj_bias[-config.hidden_size:] + + # to do: add base model support + if base_model: + pass + + +def remove_classification_head_(state_dict): + ignore_keys = [ + "norm.weight", + "norm.bias", + "head.weight", + "head.bias", + ] + for k in ignore_keys: + state_dict.pop(k, None) + + +def rename_key(dct, old, new): + val = dct.pop(old) + dct[new] = val + + +# We will verify our results on an image of cute cats +def prepare_img(image_resolution): + url = 'http://images.cocodataset.org/val2017/000000039769.jpg' + im = Image.open(requests.get(url, stream=True).raw) + + # standard PyTorch mean-std input image normalization + transform = T.Compose([ + T.Resize((image_resolution,image_resolution)), + T.ToTensor(), + T.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]) + ]) + + # mean-std normalize the input image (batch-size: 1) + img = transform(im).unsqueeze(0) + + return img + + +@torch.no_grad() +def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path, base_model=False): + """ + Copy/paste/tweak model's weights to our ViT structure. + """ + + # define HuggingFace configuration + config = ViTConfig() + if vit_name == "vit_base_patch16_224": + config.num_labels=1000 + elif vit_name == "vit_base_patch32_224": + config.patch_size = 32 + config.num_labels=1000 + elif vit_name == "vit_base_patch16_384": + config.img_size = 384 + config.num_labels=1000 + elif vit_name == "vit_base_patch32_384": + config.img_size = 384 + config.patch_size = 32 + config.num_labels=1000 + + # load original model from timm + vit = timm.create_model(vit_name, pretrained=True) + vit.eval() + + # load state_dict of original model, remove and rename some keys + state_dict = vit.state_dict() + rename_keys = create_rename_keys(config, base_model) + for src, dest in rename_keys: + rename_key(state_dict, src, dest) + read_in_q_k_v(state_dict, config, base_model) + if base_model: + remove_classification_head_(state_dict) + + model = ViTForImageClassification(config).eval() + model.load_state_dict(state_dict) + + # Check logits on an image + img = prepare_img(config.img_size) + logits = vit(img) + outputs = model(img) + + assert logits.shape == outputs.logits.shape + assert torch.allclose(logits, outputs.logits, atol=1e-4) + + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + print(f"Saving model to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--vit_name", default='vit_base_patch16_224', type=str, help="Name of the ViT timm model you'd like to convert, currently supports ViT base models." + ) + parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") + parser.add_argument("--base_model", default=False, action="store_true", help="Whether to just load the base model without any head.") + args = parser.parse_args() + convert_vit_checkpoint(args.vit_name, args.pytorch_dump_folder_path) \ No newline at end of file diff --git a/src/transformers/models/vit/image_processor_vit.py b/src/transformers/models/vit/image_processor_vit.py new file mode 100644 index 00000000000000..df1e62438cabc8 --- /dev/null +++ b/src/transformers/models/vit/image_processor_vit.py @@ -0,0 +1,250 @@ +# coding=utf-8 +# Copyright Google AI and The HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for ViT.""" + +import numpy as np +import PIL +import torch +import torchvision +import torchvision.transforms.functional as F +from torchvision import transforms as T + +from typing import Optional, Union + +from ...file_utils import PaddingStrategy, TensorType +from ...image_processor_utils import BatchImages, PreTrainedImageProcessor +from ...utils import logging + + +logger = logging.get_logger(__name__) + +## BELOW: utilities copied from +## https://github.com/facebookresearch/detr/blob/a54b77800eb8e64e3ad0d8237789fcbf2f8350c5/util/misc.py + + +def _max_by_axis(the_list): + # type: (List[List[int]]) -> List[int] + maxes = the_list[0] + for sublist in the_list[1:]: + for index, item in enumerate(sublist): + maxes[index] = max(maxes[index], item) + return maxes + + +class NestedTensor(object): + """ + Data type that handles different types of inputs (either list of images or list of sequences), and computes the + padded output (with masking). + """ + + def __init__(self, tensors, mask: Optional[torch.Tensor]): + self.tensors = tensors + self.mask = mask + + def to(self, device): + # type: (Device) -> NestedTensor # noqa + cast_tensor = self.tensors.to(device) + mask = self.mask + if mask is not None: + assert mask is not None + cast_mask = mask.to(device) + else: + cast_mask = None + return NestedTensor(cast_tensor, cast_mask) + + def decompose(self): + return self.tensors, self.mask + + def __repr__(self): + return str(self.tensors) + + +def nested_tensor_from_tensor_list(tensor_list: Union[List[torch.Tensor], torch.Tensor]): + # TODO make this more n + if tensor_list[0].ndim == 3: + if torchvision._is_tracing(): + # nested_tensor_from_tensor_list() does not export well to ONNX + # call _onnx_nested_tensor_from_tensor_list() instead + return _onnx_nested_tensor_from_tensor_list(tensor_list) + + # TODO make it support different-sized images + max_size = _max_by_axis([list(img.shape) for img in tensor_list]) + # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) + batch_shape = [len(tensor_list)] + max_size + b, c, h, w = batch_shape + dtype = tensor_list[0].dtype + device = tensor_list[0].device + tensor = torch.zeros(batch_shape, dtype=dtype, device=device) + mask = torch.zeros((b, h, w), dtype=torch.bool, device=device) + for img, pad_img, m in zip(tensor_list, tensor, mask): + pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) + m[: img.shape[1], : img.shape[2]] = True + else: + raise ValueError("Not supported") + return NestedTensor(tensor, mask) + + +class ViTImageProcessor(PreTrainedImageProcessor): + r""" + Constructs a ViT image processor. This image processor inherits from + :class:`~transformers.PreTrainedImageProcessor` which contains most of the main methods. Users should refer to this + superclass for more information regarding those methods. 
+ Args: + image_mean (:obj:`int`, defaults to [0.485, 0.456, 0.406]): + The sequence of means for each channel, to be used when normalizing images. + image_std (:obj:`int`, defaults to [0.229, 0.224, 0.225]): + The sequence of standard deviations for each channel, to be used when normalizing images. + padding_value (:obj:`float`, defaults to 0.0): + The value that is used to fill the padding values. + return_pixel_mask (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not :meth:`~transformers.DetrImageProcessor.__call__` should return :obj:`pixel_mask`. + do_normalize (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to normalize the input with mean and standard deviation. + do_resize (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to resize the input to a certain :obj:`size`. + size (:obj:`int`, `optional`, defaults to :obj:`224`): + Resize the input image to the given size. Only has an effect if :obj:`resize` is set to :obj:`True`. + """ + + model_input_names = ["pixel_values", "pixel_mask"] + + def __init__( + self, + image_mean=[0.485, 0.456, 0.406], + image_std=[0.229, 0.224, 0.225], + padding_value=0.0, + return_pixel_mask=True, + do_normalize=True, + do_resize=True, + size=224, + **kwargs + ): + super().__init__(image_mean=image_mean, image_std=image_std, padding_value=padding_value, **kwargs) + self.return_pixel_mask = return_pixel_mask + self.do_normalize = do_normalize + self.do_resize = do_resize + self.size = size + + def __call__( + self, + images: Union[ + PIL.Image.Image, np.ndarray, torch.Tensor, List[PIL.Image.Image], List[np.ndarray], List[torch.Tensor] + ], + padding: Union[bool, str, PaddingStrategy] = False, + max_resolution: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_pixel_mask: Optional[bool] = None, + verbose: bool = True, + **kwargs + ) -> BatchImages: + """ + Main method to prepare for the model one or several image(s). + Args: + images (:obj:`PIL.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, numpy array or a Torch + tensor. + padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`): + Activates and controls padding. Accepts the following values: + * :obj:`True` or :obj:`'biggest'`: Pad to the biggest image in the batch (or no padding if only a + single image is provided). + * :obj:`'max_resolution'`: Pad to a maximum resolution specified with the argument + :obj:`max_resolution` or to the maximum acceptable input resolution for the model if that argument is + not provided. + * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with images of + different resolutions). + max_resolution (:obj:`int`, `optional`): + Controls the maximum resolution to use by one of the truncation/padding parameters. If left unset or + set to :obj:`None`, this will use the predefined model maximum resolution if a maximum resolution is + required by one of the truncation/padding parameters. If the model has no specific maximum input + resolution, truncation/padding to a maximum resolution will be deactivated. + pad_to_multiple_of (:obj:`int`, `optional`): + If set will pad the sequence to a multiple of the provided value. 
This is especially useful to enable + the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). + return_pixel_mask (:obj:`bool`, `optional`): + Whether to return the pixel mask. If left to the default, will return the pixel mask according + to the specific image processor's default. + `What are pixel masks? <../glossary.html#attention-mask>`__ + return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`): + If set, will return tensors instead of list of python floats. Acceptable values are: + * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. + * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects. + * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects. + verbose (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to print more information and warnings. + """ + # Input type checking for clearer error + assert ( + isinstance(images, PIL.Image.Image) + or isinstance(images, np.ndarray) + or isinstance(images, torch.Tensor) + or ( + ( + isinstance(images, (list, tuple)) + and ( + len(images) == 0 + or ( + isinstance(images[0], PIL.Image.Image) + or isinstance(images[0], np.ndarray) + or isinstance(images[0], torch.Tensor) + ) + ) + ) + ) + ), ( + "Images must of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single example)," + "`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]`(batch of examples)." + ) + + is_batched = bool( + isinstance(images, (list, tuple)) and (isinstance(images[0], (PIL.Image.Image, np.ndarray, torch.Tensor))) + ) + + # step 1: make images a list of PIL images no matter what + if is_batched: + if isinstance(images[0], np.ndarray): + images = [Image.fromarray(image) for image in images] + elif isinstance(images[0], torch.Tensor): + images = [T.ToPILImage()(image).convert("RGB") for image in images] + if annotations is not None: + assert len(images) == len(annotations) + else: + if isinstance(images, PIL.Image.Image): + images = [images] + if annotations is not None: + annotations = [annotations] + + # step 2: define transformations (resizing + normalization) + transformations = [] + if self.do_resize and self.size is not None: + transformations.append(T.Resize(size=self.size)) + if self.do_normalize: + normalization = T.Compose([T.ToTensor(), T.Normalize(self.image_mean, self.image_std)]) + transformations.append(normalization) + transforms = T.Compose(transformations) + + # step 3: apply transformations to images + transformed_images = [transforms(image) for image in images] + + # step 4: TO DO: replace by self.pad (which is defined in image_processor_utils.py), which takes + # care of padding + creating pixel mask + samples = nested_tensor_from_tensor_list(transformed_images) + + # return as BatchImages + data = {"pixel_values": samples.tensors, "pixel_mask": samples.mask} + + encoded_inputs = BatchImages(data=data) + + return encoded_inputs \ No newline at end of file diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py new file mode 100644 index 00000000000000..0951884a01ebf6 --- /dev/null +++ b/src/transformers/models/vit/modeling_vit.py @@ -0,0 +1,1102 @@ +# coding=utf-8 +# Copyright 2021 Google AI The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
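For orientation, a minimal sketch of how the image processor defined above is meant to be called (illustrative only: it assumes the top-level ViTImageProcessor export added elsewhere in this patch, and that BatchImages supports dict-style access like BatchEncoding):

    import requests
    from PIL import Image

    from transformers import ViTImageProcessor

    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    image = Image.open(requests.get(url, stream=True).raw)

    image_processor = ViTImageProcessor()     # defaults: shorter-edge resize to 224 + ImageNet mean/std normalization
    encoding = image_processor(image)         # returns a BatchImages
    pixel_values = encoding["pixel_values"]   # float tensor of shape (batch_size, 3, height, width)
    pixel_mask = encoding["pixel_mask"]       # mask tensor of shape (batch_size, height, width)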
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch ViT model. """ + + +import math +import os + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...file_utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from ...modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from ...modeling_utils import ( + PreTrainedModel, + SequenceSummary, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) +from ...utils import logging +from .configuration_vit import ViTConfig + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "ViTConfig" +_TOKENIZER_FOR_DOC = "ViTTokenizer" + +VIT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "vit-base-patch16-224", + # See all ViT models at https://huggingface.co/models?filter=vit +] + + +def load_tf_weights_in_vit(model, config, tf_checkpoint_path): + """Load tf checkpoints in a pytorch model.""" + try: + import re + + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." 
+ ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info("Loading TF weight {} with shape {}".format(name, shape)) + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + name = name.split("/") + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any( + n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] + for n in name + ): + logger.info("Skipping {}".format("/".join(name))) + continue + pointer = model + for m_name in name: + if re.fullmatch(r"[A-Za-z]+_\d+", m_name): + scope_names = re.split(r"_(\d+)", m_name) + else: + scope_names = [m_name] + if scope_names[0] == "kernel" or scope_names[0] == "gamma": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "output_bias" or scope_names[0] == "beta": + pointer = getattr(pointer, "bias") + elif scope_names[0] == "output_weights": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "squad": + pointer = getattr(pointer, "classifier") + else: + try: + pointer = getattr(pointer, scope_names[0]) + except AttributeError: + logger.info("Skipping {}".format("/".join(name))) + continue + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + if m_name[-11:] == "_embeddings": + pointer = getattr(pointer, "weight") + elif m_name == "kernel": + array = np.transpose(array) + try: + assert ( + pointer.shape == array.shape + ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched" + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info("Initialize PyTorch weight {}".format(name)) + pointer.data = torch.from_numpy(array) + return model + + +def mish(x): + return x * torch.tanh(nn.functional.softplus(x)) + + +""" Layer/Module Helpers +Hacked together by / Copyright 2020 Ross Wightman +""" +from itertools import repeat +import collections.abc + + +# From PyTorch internals +def _ntuple(n): + def parse(x): + if isinstance(x, collections.abc.Iterable): + return x + return tuple(repeat(x, n)) + return parse + + +to_2tuple = _ntuple(2) + + +class ViTEmbeddings(nn.Module): + """Construct the cls token, position and patch embeddings.""" + + def __init__(self, config): + super().__init__() + + self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + self.patch_embeddings = PatchEmbeddings( + img_size=config.img_size, patch_size=config.patch_size, in_chans=config.in_chans, embed_dim=config.hidden_size) + num_patches = self.patch_embeddings.num_patches + self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size)) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, images): + batch_size = images.shape[0] + embeddings = self.patch_embeddings(images) + + cls_tokens = self.cls_token.expand(batch_size, -1, -1) # stole cls_tokens impl from Phil Wang, thanks + embeddings = torch.cat((cls_tokens, embeddings), dim=1) + embeddings = embeddings + self.position_embeddings + embeddings = self.dropout(embeddings) + return embeddings + + +class PatchEmbeddings(nn.Module): + """ Image to Patch Embedding. 
+ """ + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.projection = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + + def forward(self, x): + B, C, H, W = x.shape + # FIXME look at relaxing size constraints + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + x = self.projection(x).flatten(2).transpose(1, 2) + return x + + +class ViTSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. 
+ is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # print("Hidden states before self-attention:") + # print(hidden_states[0,:3,:3]) + + # print("Queries:") + # print(query_layer[0,0,:3,:3]) + + # print("Keys:") + # print(key_layer[0,0,:3,:3]) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
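    # Rough shape check at this point, assuming ViT-Base defaults (hidden_size 768,
    # 12 attention heads, 224x224 input, 16x16 patches): query_layer, key_layer and
    # value_layer are each (batch_size, 12, 197, 64), where 197 = 196 patch tokens
    # plus the [CLS] token, so the attention_scores computed below come out as
    # (batch_size, 12, 197, 197).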
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in ViTModel forward() function) + attention_scores = attention_scores + attention_mask + + # OK + # print("Attention scores before softmax:") + # print(attention_scores[0,:3,:3]) + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # OK + # print("Attention after dropout:") + # print(attention_probs[0,:3,:3]) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + #print("Hidden states after self-attention:") + #print(context_layer.shape) + #print(context_layer[0,:3,:3,:3]) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + +class ViTSelfOutput(nn.Module): + """ The residual connection is defined in VitLayer instead of here (as is the case with our models), + due to the layernorm applied before each block. 
""" + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + #self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + #hidden_states = self.LayerNorm(hidden_states + input_tensor) + + #print("Hidden states after dense + dropout:") + #print(hidden_states[0,:3,:3]) + + # first residual connection + #hidden_states = hidden_states + input_tensor + + return hidden_states + + +class ViTAttention(nn.Module): + def __init__(self, config): + super().__init__() + self.self = ViTSelfAttention(config) + self.output = ViTSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + #print("Hidden states after self-attention:") + #print(hidden_states[0,:3,:3]) + + attention_output = self.output(self_outputs[0], hidden_states) + + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class ViTIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + + #print("Hidden states before intermediate:") + #print(hidden_states[0,:3,:3]) + + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + + #print("Hidden states after intermediate:") + #print(hidden_states[0,:3,:3]) + + return hidden_states + + +class ViTOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + #self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + #hidden_states = self.LayerNorm(hidden_states + input_tensor) + + #print("Hidden states after fc2:") + #print(hidden_states[0,:3,:3]) + + hidden_states = hidden_states + input_tensor + + #print("Hidden states after adding second residual 
connection:") + #print(hidden_states[0,:3,:3]) + + return hidden_states + + +class ViTLayer(nn.Module): + """This corresponds to the Block class in the timm implementation.""" + + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = ViTAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added" + self.crossattention = ViTAttention(config) + self.intermediate = ViTIntermediate(config) + self.output = ViTOutput(config) + self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + + self_attention_outputs = self.attention( + self.layernorm_before(hidden_states), # in ViT, layernorm is applied before self-attention + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + assert hasattr( + self, "crossattention" + ), f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`" + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + # first residual connection + hidden_states = attention_output + hidden_states + + #print("Hidden states before second layernorm:") + #print(hidden_states[0,:3,:3]) + + # in ViT, layernorm is also applied after self-attention + layer_output = self.layernorm_after(hidden_states) + + #print("Hidden states after second layer norm:") + #print(layer_output[0,:3,:3]) + + # feedforward chunking not working for now + # layer_output = apply_chunking_to_forward( + # self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, layer_output + # ) + + layer_output = self.intermediate(layer_output) + + # second residual connection is done here + layer_output = self.output(layer_output, 
hidden_states) + + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output) + return layer_output + + +class ViTEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([ViTLayer(config) for _ in range(config.num_hidden_layers)]) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False) and self.training: + + if use_cache: + logger.warn( + "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting " + "`use_cache=False`..." + ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + #print("Hidden states before layer:", i) + #print(hidden_states[0,:3,:3]) + + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + #print("Hidden states after layer:", i) + #print(hidden_states[0,:3,:3]) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class ViTPredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) 
+ + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class ViTLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = ViTPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class ViTOnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = ViTLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class ViTPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and + a simple interface for downloading and loading pretrained models. + """ + + config_class = ViTConfig + load_tf_weights = load_tf_weights_in_vit + base_model_prefix = "vit" + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def _init_weights(self, module): + """ Initialize the weights """ + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +VIT_START_DOCSTRING = r""" + This model is a PyTorch `torch.nn.Module `_ sub-class. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general + usage and behavior. + + Parameters: + config (:class:`~transformers.ViTConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the configuration. + Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. +""" + +VIT_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`transformers.ViTTokenizer`. + See :func:`transformers.PreTrainedTokenizer.encode` and + :func:`transformers.PreTrainedTokenizer.__call__` for details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`): + Segment token indices to indicate first and second portions of the inputs. 
Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + `What are token type IDs? <../glossary.html#token-type-ids>`_ + position_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. + Selected in the range ``[0, config.max_position_embeddings - 1]``. + + `What are position IDs? <../glossary.html#position-ids>`_ + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare ViT Model transformer outputting raw hidden-states without any specific head on top.", + VIT_START_DOCSTRING, +) +class ViTModel(ViTPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well + as a decoder, in which case a layer of cross-attention is added between + the self-attention layers, following the architecture described in `Attention is + all you need `__ by Ashish Vaswani, + Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the + :obj:`is_decoder` argument of the configuration set to :obj:`True`. + To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder` + argument and :obj:`add_cross_attention` set to :obj:`True`; an + :obj:`encoder_hidden_states` is then expected as an input to the forward pass. + """ + + def __init__(self, config): + super().__init__(config) + self.config = config + + self.embeddings = ViTEmbeddings(config) + self.encoder = ViTEncoder(config) + + self.init_weights() + + def get_patch_embeddings(self): + return self.embeddings.patch_embeddings + + def set_patch_embeddings(self, value): + self.embeddings.patch_embeddings = value + + def _prune_heads(self, heads_to_prune): + """Prunes heads of the model. 
+ heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + See base class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="vit-base-patch16-224", + output_type=BaseModelOutputWithPastAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + images=None, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + if the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask + is used in the cross-attention if the model is configured as a decoder. + Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # if self.config.is_decoder: + # use_cache = use_cache if use_cache is not None else self.config.use_cache + # else: + # use_cache = False + + # if input_ids is not None and inputs_embeds is not None: + # raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + # elif input_ids is not None: + # input_shape = input_ids.size() + # batch_size, seq_length = input_shape + # elif inputs_embeds is not None: + # input_shape = inputs_embeds.size()[:-1] + # batch_size, seq_length = input_shape + # else: + # raise ValueError("You have to specify either input_ids or inputs_embeds") + + # device = input_ids.device if input_ids is not None else inputs_embeds.device + + # # past_key_values_length + # past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + + # if attention_mask is None: + # attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + # if token_type_ids is None: + # token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # # ourselves in which case we just need to make it broadcastable to all heads. + # extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) + + # # If a 2D or 3D attention mask is provided for the cross-attention + # # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + # if self.config.is_decoder and encoder_hidden_states is not None: + # encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + # encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + # if encoder_attention_mask is None: + # encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + # encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + # else: + # encoder_extended_attention_mask = None + + # # Prepare head mask if needed + # # 1.0 in head_mask indicate we keep the head + # # attention_probs has shape bsz x n_heads x N x N + # # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + # head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + images, + ) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=None, # replaced extended_attention_mask + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=None, # replaced encoder_extended_attention_mask + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + + if not return_dict: + return (sequence_output,) + encoder_outputs[1:] + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=sequence_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + 
attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +# class ViTClassificationHead(nn.Module): +# """Head for image classification tasks.""" + +# def __init__(self, config): +# super().__init__() +# self.dense = nn.Linear(config.hidden_size, config.hidden_size) +# self.dropout = nn.Dropout(config.hidden_dropout_prob) +# self.out_proj = nn.Linear(config.hidden_size, config.num_labels) + +# self.config = config + +# def forward(self, features, **kwargs): +# x = features[:, 0, :] # take [CLS] token +# x = self.dropout(x) +# x = self.dense(x) +# x = ACT2FN[self.config.hidden_act](x) +# x = self.dropout(x) +# x = self.out_proj(x) +# return x + + +@add_start_docstrings( + """ViT Model transformer with an image classification head on top (a linear layer on top of + the pooled output) e.g. for ImageNet. """, + VIT_START_DOCSTRING, +) +class ViTForImageClassification(ViTPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.num_labels = config.num_labels + self.vit = ViTModel(config) + # Classifier head + self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity() + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + self.init_weights() + + @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + # @add_code_sample_docstrings( + # tokenizer_class=_TOKENIZER_FOR_DOC, + # checkpoint="vit-base-patch16-224", + # output_type=SequenceClassifierOutput, + # config_class=_CONFIG_FOR_DOC, + # ) + def forward( + self, + images=None, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. + Indices should be in :obj:`[0, ..., config.num_labels - 1]`. + If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.vit( + images, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.layernorm(sequence_output[:, 0, :]) + logits = self.classifier(sequence_output) + + #print("Logits:") + #print(logits[0,:3]) + + loss = None + if labels is not None: + if self.num_labels == 1: + # We are doing regression + loss_fct = MSELoss() + loss = loss_fct(logits.view(-1), labels.view(-1)) + else: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) \ No newline at end of file diff --git a/tests/test_modeling_vit.py b/tests/test_modeling_vit.py new file mode 100644 index 00000000000000..8216318be2447f --- /dev/null +++ b/tests/test_modeling_vit.py @@ -0,0 +1,478 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch ViT model. 
""" + + +import unittest + +from tests.test_modeling_common import floats_tensor +from transformers import is_torch_available +from transformers.testing_utils import require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + from transformers import ( + ViTConfig, + ViTForCausalLM, + ViTForMaskedLM, + ViTForMultipleChoice, + ViTForQuestionAnswering, + ViTForSequenceClassification, + ViTForTokenClassification, + ViTModel, + ) + from transformers.models.vit.modeling_vit import ( + VIT_PRETRAINED_MODEL_ARCHIVE_LIST, + ) + + +class ViTModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = ViTConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + ) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + token_type_ids, + input_mask, + 
sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() + + config.is_decoder = True + encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + return ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = ViTModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_model_as_decoder( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.add_cross_attention = True + model = ViTModel(config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + ) + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + ) + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_causal_lm( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + model = ViTForCausalLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = ViTForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_decoder_model_past_large_inputs( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.is_decoder = True + config.add_cross_attention = True + model = ViTForCausalLM(config=config) + model.to(torch_device) + model.eval() + + # first forward pass + outputs = model( + input_ids, + attention_mask=input_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=True, + ) + past_key_values = outputs.past_key_values + + # create hypothetical multiple next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_mask = 
ids_tensor((self.batch_size, 3), vocab_size=2) + + # append to next input_ids and + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) + + output_from_no_past = model( + next_input_ids, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_hidden_states=True, + )["hidden_states"][0] + output_from_past = model( + next_tokens, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + output_hidden_states=True, + )["hidden_states"][0] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = ViTForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = ViTForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = ViTForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = ViTForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + 
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class ViTModelTest(ModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + ViTModel, + ViTForMaskedLM, + ViTForCausalLM, + ViTForMultipleChoice, + ViTForQuestionAnswering, + ViTForSequenceClassification, + ViTForTokenClassification, + ) + if is_torch_available() + else () + ) + all_generative_model_classes = (ViTForCausalLM,) if is_torch_available() else () + + def setUp(self): + self.model_tester = ViTModelTester(self) + self.config_tester = ConfigTester(self, config_class=ViTConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + def test_model_as_decoder(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) + + def test_model_as_decoder_with_default_input_mask(self): + # This regression test was failing with PyTorch < 1.3 + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) = self.model_tester.prepare_config_and_inputs_for_decoder() + + input_mask = None + + self.model_tester.create_and_check_model_as_decoder( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + @slow + def test_model_from_pretrained(self): + for model_name in 
VIT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = ViTModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_torch +class ViTModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_masked_lm(self): + model = ViTForMaskedLM.from_pretrained("google/vit_small_patch16_224") + input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]]) + output = model(input_ids)[0] + + # TODO Replace vocab size + vocab_size = 32000 + + expected_shape = torch.Size((1, 6, vocab_size)) + self.assertEqual(output.shape, expected_shape) + + # TODO Replace values below with what was printed above. + expected_slice = torch.tensor( + [[[-0.0483, 0.1188, -0.0313], [-0.0606, 0.1435, 0.0199], [-0.0235, 0.1519, 0.0175]]] + ) + + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) + + From a8d48c20a99815b009ade161f1a5fcf34f0b7831 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 3 Mar 2021 15:50:08 +0100 Subject: [PATCH 02/44] Add List typing hint --- src/transformers/models/vit/image_processor_vit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/vit/image_processor_vit.py b/src/transformers/models/vit/image_processor_vit.py index df1e62438cabc8..987cfabc037b31 100644 --- a/src/transformers/models/vit/image_processor_vit.py +++ b/src/transformers/models/vit/image_processor_vit.py @@ -21,7 +21,7 @@ import torchvision.transforms.functional as F from torchvision import transforms as T -from typing import Optional, Union +from typing import Optional, Union, List from ...file_utils import PaddingStrategy, TensorType from ...image_processor_utils import BatchImages, PreTrainedImageProcessor From 836346948924aec14a04377fab115a2047db33e6 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 3 Mar 2021 15:53:19 +0100 Subject: [PATCH 03/44] Remove annotations --- src/transformers/models/vit/image_processor_vit.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/transformers/models/vit/image_processor_vit.py b/src/transformers/models/vit/image_processor_vit.py index 987cfabc037b31..9cba8bb32a373c 100644 --- a/src/transformers/models/vit/image_processor_vit.py +++ b/src/transformers/models/vit/image_processor_vit.py @@ -218,13 +218,9 @@ def __call__( images = [Image.fromarray(image) for image in images] elif isinstance(images[0], torch.Tensor): images = [T.ToPILImage()(image).convert("RGB") for image in images] - if annotations is not None: - assert len(images) == len(annotations) else: if isinstance(images, PIL.Image.Image): images = [images] - if annotations is not None: - annotations = [annotations] # step 2: define transformations (resizing + normalization) transformations = [] From f4e4fb39747a57892da3150b65b5696221584e41 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 3 Mar 2021 16:03:07 +0100 Subject: [PATCH 04/44] Potential bug fix --- src/transformers/models/vit/image_processor_vit.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/vit/image_processor_vit.py b/src/transformers/models/vit/image_processor_vit.py index 9cba8bb32a373c..e30a2313bebb5e 100644 --- a/src/transformers/models/vit/image_processor_vit.py +++ b/src/transformers/models/vit/image_processor_vit.py @@ -219,13 +219,17 @@ def __call__( elif isinstance(images[0], torch.Tensor): images = [T.ToPILImage()(image).convert("RGB") for image in images] else: - if isinstance(images, PIL.Image.Image): + if isinstance(images, np.ndarray): + images = [Image.fromarray(images)] + elif isinstance(images, torch.Tensor) + 
images = [T.ToPILImage()(images).convert("RGB")] + else: images = [images] # step 2: define transformations (resizing + normalization) transformations = [] if self.do_resize and self.size is not None: - transformations.append(T.Resize(size=self.size)) + transformations.append(T.Resize(size=(self.size, self.size))) if self.do_normalize: normalization = T.Compose([T.ToTensor(), T.Normalize(self.image_mean, self.image_std)]) transformations.append(normalization) From ff97a920addecb841c65a48eb4f5bd4f8d98ae2a Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 3 Mar 2021 16:04:16 +0100 Subject: [PATCH 05/44] Bug fix --- src/transformers/models/vit/image_processor_vit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/vit/image_processor_vit.py b/src/transformers/models/vit/image_processor_vit.py index e30a2313bebb5e..dfb1aa7e0411dc 100644 --- a/src/transformers/models/vit/image_processor_vit.py +++ b/src/transformers/models/vit/image_processor_vit.py @@ -221,7 +221,7 @@ def __call__( else: if isinstance(images, np.ndarray): images = [Image.fromarray(images)] - elif isinstance(images, torch.Tensor) + elif isinstance(images, torch.Tensor): images = [T.ToPILImage()(images).convert("RGB")] else: images = [images] From 46ea2b693749410f325e2bb65913ea9b70cbd234 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 3 Mar 2021 16:16:28 +0100 Subject: [PATCH 06/44] Rename inputs to pixel_values --- .../models/vit/image_processor_vit.py | 4 ++-- src/transformers/models/vit/modeling_vit.py | 16 +++++++++------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/vit/image_processor_vit.py b/src/transformers/models/vit/image_processor_vit.py index dfb1aa7e0411dc..6c0bd29cc209f5 100644 --- a/src/transformers/models/vit/image_processor_vit.py +++ b/src/transformers/models/vit/image_processor_vit.py @@ -238,8 +238,8 @@ def __call__( # step 3: apply transformations to images transformed_images = [transforms(image) for image in images] - # step 4: TO DO: replace by self.pad (which is defined in image_processor_utils.py), which takes - # care of padding + creating pixel mask + # step 4: TO DO: replace by self.pad (which is defined in image_processor_utils.py), which should + # take care of padding + creating pixel mask samples = nested_tensor_from_tensor_list(transformed_images) # return as BatchImages diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py index 0951884a01ebf6..803cc1a5d182bb 100644 --- a/src/transformers/models/vit/modeling_vit.py +++ b/src/transformers/models/vit/modeling_vit.py @@ -171,9 +171,9 @@ def __init__(self, config): self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size)) self.dropout = nn.Dropout(config.hidden_dropout_prob) - def forward(self, images): - batch_size = images.shape[0] - embeddings = self.patch_embeddings(images) + def forward(self, pixel_values): + batch_size = pixel_values.shape[0] + embeddings = self.patch_embeddings(pixel_values) cls_tokens = self.cls_token.expand(batch_size, -1, -1) # stole cls_tokens impl from Phil Wang, thanks embeddings = torch.cat((cls_tokens, embeddings), dim=1) @@ -871,7 +871,8 @@ def _prune_heads(self, heads_to_prune): ) def forward( self, - images=None, + pixel_values=None, + pixel_mask=None, input_ids=None, attention_mask=None, token_type_ids=None, @@ -962,7 +963,7 @@ def forward( # head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) embedding_output = 
self.embeddings( - images, + pixel_values, ) encoder_outputs = self.encoder( @@ -1038,7 +1039,8 @@ def __init__(self, config): # ) def forward( self, - images=None, + pixel_values=None, + pixel_mask=None, input_ids=None, attention_mask=None, token_type_ids=None, @@ -1060,7 +1062,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.vit( - images, + pixel_values, input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, From b6fba1c60b7dca8b26995cd1d12c82d01796fa0f Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 3 Mar 2021 20:25:50 +0100 Subject: [PATCH 07/44] First draft of ImageProcessor tests --- .../models/vit/image_processor_vit.py | 8 +- tests/test_image_processor_common.py | 160 ++++++++++++++++++ tests/test_image_processor_vit.py | 132 +++++++++++++++ 3 files changed, 296 insertions(+), 4 deletions(-) create mode 100644 tests/test_image_processor_common.py create mode 100644 tests/test_image_processor_vit.py diff --git a/src/transformers/models/vit/image_processor_vit.py b/src/transformers/models/vit/image_processor_vit.py index 6c0bd29cc209f5..3ddf64c3c11ed8 100644 --- a/src/transformers/models/vit/image_processor_vit.py +++ b/src/transformers/models/vit/image_processor_vit.py @@ -115,7 +115,7 @@ class ViTImageProcessor(PreTrainedImageProcessor): do_resize (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to resize the input to a certain :obj:`size`. size (:obj:`int`, `optional`, defaults to :obj:`224`): - Resize the input image to the given size. Only has an effect if :obj:`resize` is set to :obj:`True`. + Resize the input to the given size. Only has an effect if :obj:`resize` is set to :obj:`True`. """ model_input_names = ["pixel_values", "pixel_mask"] @@ -205,7 +205,7 @@ def __call__( ) ), ( "Images must of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single example)," - "`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]`(batch of examples)." + "`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of examples)." ) is_batched = bool( @@ -215,12 +215,12 @@ def __call__( # step 1: make images a list of PIL images no matter what if is_batched: if isinstance(images[0], np.ndarray): - images = [Image.fromarray(image) for image in images] + images = [Image.fromarray(image).convert("RGB") for image in images] elif isinstance(images[0], torch.Tensor): images = [T.ToPILImage()(image).convert("RGB") for image in images] else: if isinstance(images, np.ndarray): - images = [Image.fromarray(images)] + images = [Image.fromarray(images).convert("RGB")] elif isinstance(images, torch.Tensor): images = [T.ToPILImage()(images).convert("RGB")] else: diff --git a/tests/test_image_processor_common.py b/tests/test_image_processor_common.py new file mode 100644 index 00000000000000..f9797529bdf428 --- /dev/null +++ b/tests/test_image_processor_common.py @@ -0,0 +1,160 @@ +# coding=utf-8 +# Copyright 2021 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +import json +import os +import tempfile + +import numpy as np + +from transformers import BatchImages +from transformers.testing_utils import require_tf, require_torch + + +class ImageProcessorMixin: + + # to overwrite at image processor specific tests + image_processor_tester = None + image_processor_class = None + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_common_properties(self): + image_processor = self.image_processor_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processor, "image_mean")) + self.assertTrue(hasattr(image_processor, "image_std")) + self.assertTrue(hasattr(image_processor, "padding_value")) + + def test_image_processor_to_json_string(self): + image_processor = self.image_processor_class(**self.image_processor_dict) + obj = json.loads(image_processor.to_json_string()) + for key, value in self.image_processor_dict.items(): + self.assertEqual(obj[key], value) + + def test_image_processor_to_json_file(self): + image_processor_first = self.image_processor_class(**self.image_processor_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + json_file_path = os.path.join(tmpdirname, "image_processor.json") + image_processor_first.to_json_file(json_file_path) + image_processor_second = self.image_processor_class.from_json_file(json_file_path) + + self.assertEqual(image_processor_second.to_dict(), image_processor_first.to_dict()) + + def test_image_processor_from_and_save_pretrained(self): + image_processor_first = self.image_processor_class(**self.image_processor_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + image_processor_first.save_pretrained(tmpdirname) + image_processor_second = self.image_processor_class.from_pretrained(tmpdirname) + + self.assertEqual(image_processor_second.to_dict(), image_processor_first.to_dict()) + + def test_init_without_params(self): + image_processor = self.image_processor_class() + self.assertIsNotNone(image_processor) + + def test_batch_images(self): + image_inputs = self.image_processor_tester.prepare_inputs_for_common() + image_processor = self.image_processor_class(**self.image_processor_dict) + input_name = image_processor.model_input_names[0] + + processed_images = BatchImages({input_name: image_inputs}) + + self.assertTrue(all(len(x) == len(y) for x, y in zip(image_inputs, processed_images[input_name]))) + + image_inputs = self.image_processor_tester.prepare_inputs_for_common(equal_length=True) + processed_images = BatchImages({input_name: image_inputs}, tensor_type="np") + + batch_images_input = processed_images[input_name] + + if len(batch_images_input.shape) < 3: + batch_images_input = batch_images_input[:, :, None] + + # self.assertTrue( + # batch_images_input.shape + # == (self.image_processor_tester.batch_size, len(image_inputs[0]), self.image_processor_tester.feature_size) + # ) + + @require_torch + def test_batch_images_pt(self): + image_inputs = self.image_processor_tester.prepare_inputs_for_common(equal_length=True) + image_processor = self.image_processor_class(**self.image_processor_dict) + input_name = image_processor.model_input_names[0] + + processed_images = BatchImages({input_name: image_inputs}, tensor_type="pt") + + batch_images_input = processed_images[input_name] + + if len(batch_images_input.shape) < 3: + batch_images_input = batch_images_input[:, :, None] + + # 
self.assertTrue( + # batch_images_input.shape + # == (self.image_processor_tester.batch_size, len(image_inputs[0]), self.image_processor_tester.feature_size) + # ) + + @require_tf + def test_batch_images_tf(self): + image_inputs = self.image_processor_tester.prepare_inputs_for_common(equal_length=True) + image_processor = self.image_processor_class(**self.image_processor_dict) + input_name = image_processor.model_input_names[0] + + processed_images = BatchImages({input_name: image_inputs}, tensor_type="tf") + + batch_images_input = processed_images[input_name] + + if len(batch_images_input.shape) < 3: + batch_images_input = batch_images_input[:, :, None] + + # self.assertTrue( + # batch_images_input.shape + # == (self.image_processor_tester.batch_size, len(image_inputs[0]), self.image_processor_tester.feature_size) + # ) + + def _check_padding(self, numpify=False): + pass + + def test_padding_from_list(self): + self._check_padding(numpify=False) + + def test_padding_from_array(self): + self._check_padding(numpify=True) + + @require_torch + def test_padding_accepts_tensors_pt(self): + pass + + @require_tf + def test_padding_accepts_tensors_tf(self): + pass + + def test_pixel_mask(self): + feat_dict = self.image_processor_dict + feat_dict["return_pixel_mask"] = True + image_processor = self.image_processor_class(**feat_dict) + image_inputs = self.image_processor_tester.prepare_inputs_for_common() + input_lenghts = [len(x) for x in image_inputs] + input_name = image_processor.model_input_names[0] + + processed = BatchImages({input_name: image_inputs}) + + processed = image_processor.pad(processed, padding="biggest", return_tensors="np") + self.assertIn("pixel_mask", processed) + self.assertListEqual(list(processed.attention_mask.shape), list(processed[input_name].shape[:2])) + self.assertListEqual(processed.attention_mask.sum(-1).tolist(), input_lenghts) \ No newline at end of file diff --git a/tests/test_image_processor_vit.py b/tests/test_image_processor_vit.py new file mode 100644 index 00000000000000..ed45cfe18a3317 --- /dev/null +++ b/tests/test_image_processor_vit.py @@ -0,0 +1,132 @@ +# coding=utf-8 +# Copyright 2021 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
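+# For reference, a sketch of the preprocessing that ViTImageProcessor is intended to apply when
+# `do_resize` and `do_normalize` are enabled (the defaults). The helper below is illustrative only
+# and is not used by the tests in this file; `size`, `image_mean` and `image_std` mirror the
+# processor attributes defined in image_processor_vit.py.
+#
+#     from torchvision import transforms as T
+#
+#     def reference_vit_preprocess(pil_image, size=224, image_mean=(0.485, 0.456, 0.406), image_std=(0.229, 0.224, 0.225)):
+#         pipeline = T.Compose(
+#             [
+#                 T.Resize(size=(size, size)),         # square resize to `size` x `size`
+#                 T.ToTensor(),                        # PIL image -> float tensor in [0, 1], shape (C, H, W)
+#                 T.Normalize(image_mean, image_std),  # per-channel normalization
+#             ]
+#         )
+#         return pipeline(pil_image)                   # tensor of shape (num_channels, size, size)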
+ + +import itertools +import random +import unittest + +import numpy as np + +from transformers import VIT_PRETRAINED_MODEL_ARCHIVE_LIST, ViTConfig, ViTImageProcessor +from transformers.testing_utils import slow + +from .test_image_processor_common import ImageProcessorMixin + + +global_rng = random.Random() + + +def floats_list(shape, scale=1.0, rng=None, name=None): + """Creates a random float32 tensor""" + if rng is None: + rng = global_rng + + values = [] + for batch_idx in range(shape[0]): + values.append([]) + for _ in range(shape[1]): + values[-1].append(rng.random() * scale) + + return values + + +class ViTImageProcessorTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + min_resolution=400, + max_resolution=2000, + image_mean=[0.485, 0.456, 0.406], + image_std=[0.5, 0.5, 0.5], + padding_value=0.0, + return_pixel_mask=True, + do_normalize=True, + do_resize=True, + size=18, + ): + self.parent = parent + self.batch_size = batch_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.image_mean = image_mean + self.image_std = image_std + self.padding_value = padding_value + self.return_pixel_mask = return_pixel_mask + self.do_normalize = do_normalize + self.do_resize = do_resize + self.size = size + + def prepare_image_processor_dict(self): + return { + "image_mean": self.image_mean, + "image_std": self.image_std, + "padding_value": self.padding_value, + "return_pixel_mask": self.return_pixel_mask, + "do_normalize": self.do_normalize, + "do_resize": self.do_resize, + "size": self.size, + } + + def prepare_inputs_for_common(self, equal_resolution=False, numpify=False): + def _flatten(list_of_lists): + return list(itertools.chain(*list_of_lists)) + + if equal_resolution: + image_inputs = floats_list((self.batch_size, self.max_seq_length)) + else: + image_inputs = [ + _flatten(floats_list((x, self.feature_size))) + for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff) + ] + + if numpify: + image_inputs = [np.asarray(x) for x in image_inputs] + + return image_inputs + + +class ViTImageProcessorTest(ImageProcessorMixin, unittest.TestCase): + + image_processor_class = ViTImageProcessor + + def setUp(self): + self.image_processor_tester = VitImageProcessorTester(self) + + def test_call(self): + # Tests that all call wrap to encode_plus and batch_encode_plus + image_processor = self.image_processor_class(**self.image_processor_tester.prepare_image_processor_dict()) + # create three inputs of resolution 800, 1000, and 1200 + image_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] + np_image_inputs = [np.asarray(speech_input) for speech_input in image_inputs] + + # Test not batched input + encoded_sequences_1 = image_processor(image_inputs[0], return_tensors="np").input_values + encoded_sequences_2 = image_processor(np_image_inputs[0], return_tensors="np").input_values + self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3)) + + # Test batched + encoded_sequences_1 = image_processor(image_inputs, return_tensors="np").input_values + encoded_sequences_2 = image_processor(np_image_inputs, return_tensors="np").input_values + for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): + self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) + + def test_normalization(self): + pass + + @slow + def test_pretrained_checkpoints_are_set_correctly(self): + pass \ No newline at end of file From bc6f12dba9c2370162d513653edc9f0ad61a3465 Mon Sep 17 00:00:00 2001 
From: Niels Rogge Date: Thu, 4 Mar 2021 09:43:22 +0100 Subject: [PATCH 08/44] Clean up: remove print statements, remove unused variables --- .../models/vit/configuration_vit.py | 12 +- .../models/vit/convert_vit_timm_to_pytorch.py | 8 +- src/transformers/models/vit/modeling_vit.py | 243 +++--------------- tests/test_image_processor_vit.py | 62 +++-- 4 files changed, 94 insertions(+), 231 deletions(-) diff --git a/src/transformers/models/vit/configuration_vit.py b/src/transformers/models/vit/configuration_vit.py index 40894f47d6196d..a787baa084f931 100644 --- a/src/transformers/models/vit/configuration_vit.py +++ b/src/transformers/models/vit/configuration_vit.py @@ -75,11 +75,11 @@ class ViTConfig(PretrainedConfig): relevant if ``config.is_decoder=True``. gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`): If True, use gradient checkpointing to save memory at the expense of slower backward pass. - img_size (:obj:`int`, `optional`, defaults to :obj:`224`): + image_size (:obj:`int`, `optional`, defaults to :obj:`224`): The size (resolution) of each image. patch_size (:obj:`int`, `optional`, defaults to :obj:`16`): The size (resolution) of each patch. - in_chans (:obj:`int`, `optional`, defaults to :obj:`3`): + num_channels (:obj:`int`, `optional`, defaults to :obj:`3`): The number of input channels. Example:: @@ -115,9 +115,9 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - img_size=224, + image_size=224, patch_size=16, - in_chans=3, + num_channels=3, **kwargs ): super().__init__( @@ -141,7 +141,7 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.use_cache = use_cache - self.img_size = img_size + self.image_size = image_size self.patch_size = patch_size - self.in_chans = in_chans + self.num_channels = num_channels \ No newline at end of file diff --git a/src/transformers/models/vit/convert_vit_timm_to_pytorch.py b/src/transformers/models/vit/convert_vit_timm_to_pytorch.py index 296d553b3034fe..37443c9765ba89 100644 --- a/src/transformers/models/vit/convert_vit_timm_to_pytorch.py +++ b/src/transformers/models/vit/convert_vit_timm_to_pytorch.py @@ -146,10 +146,10 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path, base_model=False) config.patch_size = 32 config.num_labels=1000 elif vit_name == "vit_base_patch16_384": - config.img_size = 384 + config.image_size = 384 config.num_labels=1000 elif vit_name == "vit_base_patch32_384": - config.img_size = 384 + config.image_size = 384 config.patch_size = 32 config.num_labels=1000 @@ -170,7 +170,7 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path, base_model=False) model.load_state_dict(state_dict) # Check logits on an image - img = prepare_img(config.img_size) + img = prepare_img(config.image_size) logits = vit(img) outputs = model(img) @@ -188,7 +188,7 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path, base_model=False) parser.add_argument( "--vit_name", default='vit_base_patch16_224', type=str, help="Name of the ViT timm model you'd like to convert, currently supports ViT base models." 
) - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") + parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory.") parser.add_argument("--base_model", default=False, action="store_true", help="Whether to just load the base model without any head.") args = parser.parse_args() convert_vit_checkpoint(args.vit_name, args.pytorch_dump_folder_path) \ No newline at end of file diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py index 803cc1a5d182bb..d96880d5658ca1 100644 --- a/src/transformers/models/vit/modeling_vit.py +++ b/src/transformers/models/vit/modeling_vit.py @@ -61,84 +61,6 @@ ] -def load_tf_weights_in_vit(model, config, tf_checkpoint_path): - """Load tf checkpoints in a pytorch model.""" - try: - import re - - import numpy as np - import tensorflow as tf - except ImportError: - logger.error( - "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " - "https://www.tensorflow.org/install/ for installation instructions." - ) - raise - tf_path = os.path.abspath(tf_checkpoint_path) - logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) - # Load weights from TF model - init_vars = tf.train.list_variables(tf_path) - names = [] - arrays = [] - for name, shape in init_vars: - logger.info("Loading TF weight {} with shape {}".format(name, shape)) - array = tf.train.load_variable(tf_path, name) - names.append(name) - arrays.append(array) - - for name, array in zip(names, arrays): - name = name.split("/") - # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v - # which are not required for using pretrained model - if any( - n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] - for n in name - ): - logger.info("Skipping {}".format("/".join(name))) - continue - pointer = model - for m_name in name: - if re.fullmatch(r"[A-Za-z]+_\d+", m_name): - scope_names = re.split(r"_(\d+)", m_name) - else: - scope_names = [m_name] - if scope_names[0] == "kernel" or scope_names[0] == "gamma": - pointer = getattr(pointer, "weight") - elif scope_names[0] == "output_bias" or scope_names[0] == "beta": - pointer = getattr(pointer, "bias") - elif scope_names[0] == "output_weights": - pointer = getattr(pointer, "weight") - elif scope_names[0] == "squad": - pointer = getattr(pointer, "classifier") - else: - try: - pointer = getattr(pointer, scope_names[0]) - except AttributeError: - logger.info("Skipping {}".format("/".join(name))) - continue - if len(scope_names) >= 2: - num = int(scope_names[1]) - pointer = pointer[num] - if m_name[-11:] == "_embeddings": - pointer = getattr(pointer, "weight") - elif m_name == "kernel": - array = np.transpose(array) - try: - assert ( - pointer.shape == array.shape - ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched" - except AssertionError as e: - e.args += (pointer.shape, array.shape) - raise - logger.info("Initialize PyTorch weight {}".format(name)) - pointer.data = torch.from_numpy(array) - return model - - -def mish(x): - return x * torch.tanh(nn.functional.softplus(x)) - - """ Layer/Module Helpers Hacked together by / Copyright 2020 Ross Wightman """ @@ -146,6 +68,8 @@ def mish(x): import collections.abc +# Copied from +# 
https://github.com/rwightman/pytorch-image-models/blob/b9bd960a032c75ca6b808ddeed76bee5f3ed4972/timm/models/layers/helpers.py # From PyTorch internals def _ntuple(n): def parse(x): @@ -166,7 +90,8 @@ def __init__(self, config): self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) self.patch_embeddings = PatchEmbeddings( - img_size=config.img_size, patch_size=config.patch_size, in_chans=config.in_chans, embed_dim=config.hidden_size) + image_size=config.image_size, patch_size=config.patch_size, num_channels=config.num_channels, + embed_dim=config.hidden_size) num_patches = self.patch_embeddings.num_patches self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size)) self.dropout = nn.Dropout(config.hidden_dropout_prob) @@ -183,24 +108,23 @@ def forward(self, pixel_values): class PatchEmbeddings(nn.Module): - """ Image to Patch Embedding. - """ - def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + """ Image to Patch Embedding.""" + def __init__(self, image_size=224, patch_size=16, num_channels=3, embed_dim=768): super().__init__() - img_size = to_2tuple(img_size) + image_size = to_2tuple(image_size) patch_size = to_2tuple(patch_size) - num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) - self.img_size = img_size + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size self.patch_size = patch_size self.num_patches = num_patches - self.projection = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=patch_size) def forward(self, x): B, C, H, W = x.shape # FIXME look at relaxing size constraints - assert H == self.img_size[0] and W == self.img_size[1], \ - f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + assert H == self.image_size[0] and W == self.image_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})." x = self.projection(x).flatten(2).transpose(1, 2) return x @@ -281,43 +205,30 @@ def forward( # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) # if encoder bi-directional self-attention `past_key_value` is always `None` past_key_value = (key_layer, value_layer) - - # print("Hidden states before self-attention:") - # print(hidden_states[0,:3,:3]) - - # print("Queries:") - # print(query_layer[0,0,:3,:3]) - - # print("Keys:") - # print(key_layer[0,0,:3,:3]) # Take the dot product between "query" and "key" to get the raw attention scores. 
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - seq_length = hidden_states.size()[1] - position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) - position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) - distance = position_ids_l - position_ids_r - positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility - - if self.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores - elif self.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + # if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + # seq_length = hidden_states.size()[1] + # position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + # position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + # distance = position_ids_l - position_ids_r + # positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + # positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + # if self.position_embedding_type == "relative_key": + # relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + # attention_scores = attention_scores + relative_position_scores + # elif self.position_embedding_type == "relative_key_query": + # relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + # relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + # attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in ViTModel forward() function) attention_scores = attention_scores + attention_mask - - # OK - # print("Attention scores before softmax:") - # print(attention_scores[0,:3,:3]) # Normalize the attention scores to probabilities. attention_probs = nn.Softmax(dim=-1)(attention_scores) @@ -326,20 +237,12 @@ def forward( # seem a bit unusual, but is taken from the original Transformer paper. 
attention_probs = self.dropout(attention_probs) - # OK - # print("Attention after dropout:") - # print(attention_probs[0,:3,:3]) - # Mask heads if we want to if head_mask is not None: attention_probs = attention_probs * head_mask context_layer = torch.matmul(attention_probs, value_layer) - #print("Hidden states after self-attention:") - #print(context_layer.shape) - #print(context_layer[0,:3,:3,:3]) - context_layer = context_layer.permute(0, 2, 1, 3).contiguous() new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) context_layer = context_layer.view(*new_context_layer_shape) @@ -358,17 +261,12 @@ class ViTSelfOutput(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) - #self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states, input_tensor): hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) - #hidden_states = self.LayerNorm(hidden_states + input_tensor) - - #print("Hidden states after dense + dropout:") - #print(hidden_states[0,:3,:3]) # first residual connection #hidden_states = hidden_states + input_tensor @@ -421,9 +319,6 @@ def forward( output_attentions, ) - #print("Hidden states after self-attention:") - #print(hidden_states[0,:3,:3]) - attention_output = self.output(self_outputs[0], hidden_states) outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them @@ -441,15 +336,9 @@ def __init__(self, config): def forward(self, hidden_states): - #print("Hidden states before intermediate:") - #print(hidden_states[0,:3,:3]) - hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) - #print("Hidden states after intermediate:") - #print(hidden_states[0,:3,:3]) - return hidden_states @@ -457,22 +346,14 @@ class ViTOutput(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - #self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states, input_tensor): hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) - #hidden_states = self.LayerNorm(hidden_states + input_tensor) - - #print("Hidden states after fc2:") - #print(hidden_states[0,:3,:3]) hidden_states = hidden_states + input_tensor - #print("Hidden states after adding second residual connection:") - #print(hidden_states[0,:3,:3]) - return hidden_states @@ -550,15 +431,9 @@ def forward( # first residual connection hidden_states = attention_output + hidden_states - #print("Hidden states before second layernorm:") - #print(hidden_states[0,:3,:3]) - # in ViT, layernorm is also applied after self-attention layer_output = self.layernorm_after(hidden_states) - #print("Hidden states after second layer norm:") - #print(layer_output[0,:3,:3]) - # feedforward chunking not working for now # layer_output = apply_chunking_to_forward( # self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, layer_output @@ -637,10 +512,7 @@ def custom_forward(*inputs): encoder_hidden_states, encoder_attention_mask, ) - else: - #print("Hidden states before layer:", i) - #print(hidden_states[0,:3,:3]) - + else: layer_outputs = layer_module( hidden_states, attention_mask, @@ -651,9 +523,6 @@ def custom_forward(*inputs): output_attentions, ) - #print("Hidden 
states after layer:", i) - #print(hidden_states[0,:3,:3]) - hidden_states = layer_outputs[0] if use_cache: next_decoder_cache += (layer_outputs[-1],) @@ -770,44 +639,23 @@ def _init_weights(self, module): VIT_INPUTS_DOCSTRING = r""" Args: - input_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`): - Indices of input sequence tokens in the vocabulary. - - Indices can be obtained using :class:`transformers.ViTTokenizer`. - See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.__call__` for details. - - `What are input IDs? <../glossary.html#input-ids>`__ - attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`): - Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - + pixel_values (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. + Pixel values can be obtained using :class:`~transformers.ViTImageProcessor`. See + :meth:`transformers.ViTImageProcessor.__call__` for details. + + attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding pixel values. Mask values selected in ``[0, 1]``: + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). `What are attention masks? <../glossary.html#attention-mask>`__ - token_type_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`): - Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, - 1]``: - - - 0 corresponds to a `sentence A` token, - - 1 corresponds to a `sentence B` token. - - `What are token type IDs? <../glossary.html#token-type-ids>`_ - position_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`): - Indices of positions of each input sequence tokens in the position embeddings. - Selected in the range ``[0, config.max_position_embeddings - 1]``. - `What are position IDs? <../glossary.html#position-ids>`_ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: - 1 indicates the head is **not masked**, - 0 indicates the head is **masked**. - inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): - Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. - This is useful if you want more control over how to convert `input_ids` indices into associated vectors - than the model's internal embedding lookup matrix. output_attentions (:obj:`bool`, `optional`): Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned tensors for more detail. 
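For reviewers, a minimal end-to-end sketch of the pixel_values interface documented above might look as follows. It assumes the nielsr/vit-base-patch16-224 checkpoint referenced in VIT_PRETRAINED_MODEL_ARCHIVE_LIST is available on the hub, that the image processor's __call__ already returns pixel_values as a batched tensor as its docstring describes, and that ViTForImageClassification returns an output object exposing logits, as the other heads in this patch do; it illustrates the intended call pattern rather than the final, documented API.

import torch
from transformers import ViTImageProcessor, ViTForImageClassification

# processor defaults: resize to 224x224 and normalize with ImageNet mean/std
image_processor = ViTImageProcessor()
model = ViTForImageClassification.from_pretrained("nielsr/vit-base-patch16-224")
model.eval()

# a dummy RGB image as a (num_channels, height, width) tensor in [0, 1];
# in practice this would be a PIL.Image or a numpy array
dummy_image = torch.rand(3, 224, 224)
inputs = image_processor(dummy_image)

with torch.no_grad():
    outputs = model(pixel_values=inputs["pixel_values"])
logits = outputs.logits  # expected shape: (batch_size, config.num_labels)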
@@ -872,13 +720,8 @@ def _prune_heads(self, heads_to_prune): def forward( self, pixel_values=None, - pixel_mask=None, - input_ids=None, attention_mask=None, - token_type_ids=None, - position_ids=None, head_mask=None, - inputs_embeds=None, encoder_hidden_states=None, encoder_attention_mask=None, past_key_values=None, @@ -1040,13 +883,8 @@ def __init__(self, config): def forward( self, pixel_values=None, - pixel_mask=None, - input_ids=None, attention_mask=None, - token_type_ids=None, - position_ids=None, head_mask=None, - inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, @@ -1063,12 +901,8 @@ def forward( outputs = self.vit( pixel_values, - input_ids=input_ids, attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, head_mask=head_mask, - inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, @@ -1079,9 +913,6 @@ def forward( sequence_output = self.layernorm(sequence_output[:, 0, :]) logits = self.classifier(sequence_output) - #print("Logits:") - #print(logits[0,:3]) - loss = None if labels is not None: if self.num_labels == 1: diff --git a/tests/test_image_processor_vit.py b/tests/test_image_processor_vit.py index ed45cfe18a3317..7cdc98c0a39a08 100644 --- a/tests/test_image_processor_vit.py +++ b/tests/test_image_processor_vit.py @@ -19,6 +19,7 @@ import unittest import numpy as np +import torch from transformers import VIT_PRETRAINED_MODEL_ARCHIVE_LIST, ViTConfig, ViTImageProcessor from transformers.testing_utils import slow @@ -48,8 +49,10 @@ def __init__( self, parent, batch_size=7, - min_resolution=400, - max_resolution=2000, + num_channels=3, + image_size=224, + min_resolution=30, + max_resolution=400, image_mean=[0.485, 0.456, 0.406], image_std=[0.5, 0.5, 0.5], padding_value=0.0, @@ -60,6 +63,8 @@ def __init__( ): self.parent = parent self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size self.min_resolution = min_resolution self.max_resolution = max_resolution self.image_mean = image_mean @@ -81,20 +86,30 @@ def prepare_image_processor_dict(self): "size": self.size, } - def prepare_inputs_for_common(self, equal_resolution=False, numpify=False): + def prepare_numpy_inputs_for_common(self, equal_resolution=False): def _flatten(list_of_lists): return list(itertools.chain(*list_of_lists)) if equal_resolution: - image_inputs = floats_list((self.batch_size, self.max_seq_length)) + image_inputs = floats_list((self.batch_size, self.num_channels, self.image_size, self.image_size)) else: image_inputs = [ _flatten(floats_list((x, self.feature_size))) - for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff) + for x in range(self.min_resolution, self.max_resolution, self.seq_length_diff) ] - if numpify: - image_inputs = [np.asarray(x) for x in image_inputs] + image_inputs = [np.asarray(x) for x in image_inputs] + + return image_inputs + + def prepare_pytorch_inputs_for_common(self, equal_resolution=False): + + if equal_resolution: + input_size = (self.num_channels, self.image_size, self.image_size) + image_inputs = torch.randn((self.batch_size, *input_size)) + + else: + return image_inputs @@ -106,24 +121,41 @@ class ViTImageProcessorTest(ImageProcessorMixin, unittest.TestCase): def setUp(self): self.image_processor_tester = VitImageProcessorTester(self) - def test_call(self): - # Tests that all call wrap to encode_plus and batch_encode_plus + def test_call_numpy(self): + # Initialize 
image_processor image_processor = self.image_processor_class(**self.image_processor_tester.prepare_image_processor_dict()) # create three inputs of resolution 800, 1000, and 1200 image_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] np_image_inputs = [np.asarray(speech_input) for speech_input in image_inputs] # Test not batched input - encoded_sequences_1 = image_processor(image_inputs[0], return_tensors="np").input_values - encoded_sequences_2 = image_processor(np_image_inputs[0], return_tensors="np").input_values - self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3)) + encoded_images_1 = image_processor(image_inputs[0], return_tensors="np").input_values + encoded_images_2 = image_processor(np_image_inputs[0], return_tensors="np").input_values + self.assertTrue(np.allclose(encoded_images_1, encoded_images_2, atol=1e-3)) # Test batched - encoded_sequences_1 = image_processor(image_inputs, return_tensors="np").input_values - encoded_sequences_2 = image_processor(np_image_inputs, return_tensors="np").input_values - for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): + encoded_images_1 = image_processor(image_inputs, return_tensors="np").input_values + encoded_images_2 = image_processor(np_image_inputs, return_tensors="np").input_values + for enc_seq_1, enc_seq_2 in zip(encoded_images_1, encoded_images_2): self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) + def test_call_pytorch(self): + # Initialize image_processor + image_processor = self.image_processor_class(**self.image_processor_tester.prepare_image_processor_dict()) + # create three inputs of resolution 800, 1000, and 1200 + image_inputs = None + + # Test not batched input + encoded_images_1 = image_processor(image_inputs[0], return_tensors="pt").input_values + encoded_images_2 = image_processor(np_image_inputs[0], return_tensors="pt").input_values + self.assertTrue(np.allclose(encoded_images_1, encoded_images_2, atol=1e-3)) + + # Test batched + encoded_images_1 = image_processor(image_inputs, return_tensors="pt").input_values + encoded_images_2 = image_processor(np_image_inputs, return_tensors="pt").input_values + for enc_seq_1, enc_seq_2 in zip(encoded_images_1, encoded_images_2): + self.assertTrue(torch.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) + def test_normalization(self): pass From 56ccfa83ec8fee84475b52386de711d9f27e1435 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 4 Mar 2021 09:47:40 +0100 Subject: [PATCH 09/44] Remove load_tf_weights_in_vit --- src/transformers/models/vit/modeling_vit.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py index d96880d5658ca1..cd26eecf426432 100644 --- a/src/transformers/models/vit/modeling_vit.py +++ b/src/transformers/models/vit/modeling_vit.py @@ -609,9 +609,7 @@ class ViTPreTrainedModel(PreTrainedModel): """ config_class = ViTConfig - load_tf_weights = load_tf_weights_in_vit base_model_prefix = "vit" - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """ Initialize the weights """ From dc3c23fca79dcd84a206249c97032340560ff4d6 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 4 Mar 2021 10:02:41 +0100 Subject: [PATCH 10/44] Rename pixel_mask to attention_mask --- .../models/vit/image_processor_vit.py | 18 +++++----- tests/test_image_processor_common.py | 6 ++-- tests/test_image_processor_vit.py | 33 ++++--------------- 3 files changed, 18 insertions(+), 39 deletions(-) diff 
--git a/src/transformers/models/vit/image_processor_vit.py b/src/transformers/models/vit/image_processor_vit.py index 3ddf64c3c11ed8..ed03ab6a801f4b 100644 --- a/src/transformers/models/vit/image_processor_vit.py +++ b/src/transformers/models/vit/image_processor_vit.py @@ -108,8 +108,8 @@ class ViTImageProcessor(PreTrainedImageProcessor): The sequence of standard deviations for each channel, to be used when normalizing images. padding_value (:obj:`float`, defaults to 0.0): The value that is used to fill the padding values. - return_pixel_mask (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether or not :meth:`~transformers.DetrImageProcessor.__call__` should return :obj:`pixel_mask`. + return_attention_mask (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not :meth:`~transformers.ViTImageProcessor.__call__` should return :obj:`attention_mask`. do_normalize (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether or not to normalize the input with mean and standard deviation. do_resize (:obj:`bool`, `optional`, defaults to :obj:`True`): @@ -118,21 +118,21 @@ class ViTImageProcessor(PreTrainedImageProcessor): Resize the input to the given size. Only has an effect if :obj:`resize` is set to :obj:`True`. """ - model_input_names = ["pixel_values", "pixel_mask"] + model_input_names = ["pixel_values", "attention_mask"] def __init__( self, image_mean=[0.485, 0.456, 0.406], image_std=[0.229, 0.224, 0.225], padding_value=0.0, - return_pixel_mask=True, + return_attention_mask=True, do_normalize=True, do_resize=True, size=224, **kwargs ): super().__init__(image_mean=image_mean, image_std=image_std, padding_value=padding_value, **kwargs) - self.return_pixel_mask = return_pixel_mask + self.return_attention_mask = return_attention_mask self.do_normalize = do_normalize self.do_resize = do_resize self.size = size @@ -146,7 +146,7 @@ def __call__( max_resolution: Optional[int] = None, pad_to_multiple_of: Optional[int] = None, return_tensors: Optional[Union[str, TensorType]] = None, - return_pixel_mask: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, verbose: bool = True, **kwargs ) -> BatchImages: @@ -173,7 +173,7 @@ def __call__( pad_to_multiple_of (:obj:`int`, `optional`): If set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). - return_pixel_mask (:obj:`bool`, `optional`): + return_attention_mask (:obj:`bool`, `optional`): Whether to return the pixel mask. If left to the default, will return the pixel mask according to the specific image processor's default. `What are pixel masks? 
<../glossary.html#attention-mask>`__ @@ -239,11 +239,11 @@ def __call__( transformed_images = [transforms(image) for image in images] # step 4: TO DO: replace by self.pad (which is defined in image_processor_utils.py), which should - # take care of padding + creating pixel mask + # take care of padding, creation of attention mask, return_tensors type samples = nested_tensor_from_tensor_list(transformed_images) # return as BatchImages - data = {"pixel_values": samples.tensors, "pixel_mask": samples.mask} + data = {"pixel_values": samples.tensors, "attention_mask": samples.mask} encoded_inputs = BatchImages(data=data) diff --git a/tests/test_image_processor_common.py b/tests/test_image_processor_common.py index f9797529bdf428..f5f694176f932e 100644 --- a/tests/test_image_processor_common.py +++ b/tests/test_image_processor_common.py @@ -144,9 +144,9 @@ def test_padding_accepts_tensors_pt(self): def test_padding_accepts_tensors_tf(self): pass - def test_pixel_mask(self): + def test_attention_mask(self): feat_dict = self.image_processor_dict - feat_dict["return_pixel_mask"] = True + feat_dict["return_attention_mask"] = True image_processor = self.image_processor_class(**feat_dict) image_inputs = self.image_processor_tester.prepare_inputs_for_common() input_lenghts = [len(x) for x in image_inputs] @@ -155,6 +155,6 @@ def test_pixel_mask(self): processed = BatchImages({input_name: image_inputs}) processed = image_processor.pad(processed, padding="biggest", return_tensors="np") - self.assertIn("pixel_mask", processed) + self.assertIn("attention_mask", processed) self.assertListEqual(list(processed.attention_mask.shape), list(processed[input_name].shape[:2])) self.assertListEqual(processed.attention_mask.sum(-1).tolist(), input_lenghts) \ No newline at end of file diff --git a/tests/test_image_processor_vit.py b/tests/test_image_processor_vit.py index 7cdc98c0a39a08..5795a28cdbee10 100644 --- a/tests/test_image_processor_vit.py +++ b/tests/test_image_processor_vit.py @@ -56,7 +56,7 @@ def __init__( image_mean=[0.485, 0.456, 0.406], image_std=[0.5, 0.5, 0.5], padding_value=0.0, - return_pixel_mask=True, + return_attention_mask=True, do_normalize=True, do_resize=True, size=18, @@ -70,7 +70,7 @@ def __init__( self.image_mean = image_mean self.image_std = image_std self.padding_value = padding_value - self.return_pixel_mask = return_pixel_mask + self.return_attention_mask = return_attention_mask self.do_normalize = do_normalize self.do_resize = do_resize self.size = size @@ -80,36 +80,15 @@ def prepare_image_processor_dict(self): "image_mean": self.image_mean, "image_std": self.image_std, "padding_value": self.padding_value, - "return_pixel_mask": self.return_pixel_mask, + "return_attention_mask": self.return_attention_mask, "do_normalize": self.do_normalize, "do_resize": self.do_resize, "size": self.size, } - def prepare_numpy_inputs_for_common(self, equal_resolution=False): - def _flatten(list_of_lists): - return list(itertools.chain(*list_of_lists)) - - if equal_resolution: - image_inputs = floats_list((self.batch_size, self.num_channels, self.image_size, self.image_size)) - else: - image_inputs = [ - _flatten(floats_list((x, self.feature_size))) - for x in range(self.min_resolution, self.max_resolution, self.seq_length_diff) - ] - - image_inputs = [np.asarray(x) for x in image_inputs] - - return image_inputs - - def prepare_pytorch_inputs_for_common(self, equal_resolution=False): - - if equal_resolution: - input_size = (self.num_channels, self.image_size, self.image_size) - image_inputs = 
torch.randn((self.batch_size, *input_size)) - - else: - + def prepare_inputs_for_common(self): + input_size = (self.num_channels, self.image_size, self.image_size) + image_inputs = torch.randn((self.batch_size, *input_size)) return image_inputs From d3607b45264839d9f02f96f85a9f4d24bf2fc4d9 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 4 Mar 2021 11:36:02 +0100 Subject: [PATCH 11/44] Improve tests --- src/transformers/image_processor_utils.py | 5 +- src/transformers/models/auto/modeling_auto.py | 11 + .../models/vit/image_processor_vit.py | 1 - src/transformers/models/vit/modeling_vit.py | 2 +- tests/test_image_processor_common.py | 26 +- tests/test_image_processor_vit.py | 28 +- tests/test_modeling_vit.py | 393 ++++-------------- 7 files changed, 102 insertions(+), 364 deletions(-) diff --git a/src/transformers/image_processor_utils.py b/src/transformers/image_processor_utils.py index 8e454a1d30edc4..074e2c71d73040 100644 --- a/src/transformers/image_processor_utils.py +++ b/src/transformers/image_processor_utils.py @@ -14,6 +14,9 @@ # limitations under the License. """ Image processor common class for python image processors. + + Based on https://github.com/huggingface/transformers/blob/master/src/transformers/feature_extraction_utils.py, + but PreTrainedFeatureExtractor -> PreTrainedImageProcessor, BatchFeature -> BatchImages, and so on. """ import copy import json @@ -55,7 +58,7 @@ class BatchImages(UserDict): r""" - Holds the output of the :meth:`~transformers.ImageProcessor.pad` and image processor specific ``__call__`` methods. + Holds the output of the :meth:`~transformers.PreTrainedImageProcessor.pad` and image processor specific ``__call__`` methods. This class is derived from a python dictionary and can be used as a dictionary. diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index d81d652b1d417e..d2a264610d4412 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -23,6 +23,10 @@ from ...utils import logging # Add modeling imports here +from ..vit.modeling_vit import ( + ViTForImageClassification, + ViTModel, +) from ..albert.modeling_albert import ( AlbertForMaskedLM, AlbertForMultipleChoice, @@ -476,6 +480,13 @@ ] ) +MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING = OrderedDict( + [ + # Model for Image Classification mapping + (ViTConfig, ViTForImageClassification), + ] +) + MODEL_FOR_MASKED_LM_MAPPING = OrderedDict( [ # Model for Masked LM mapping diff --git a/src/transformers/models/vit/image_processor_vit.py b/src/transformers/models/vit/image_processor_vit.py index ed03ab6a801f4b..30f94e4eb6fe8e 100644 --- a/src/transformers/models/vit/image_processor_vit.py +++ b/src/transformers/models/vit/image_processor_vit.py @@ -18,7 +18,6 @@ import PIL import torch import torchvision -import torchvision.transforms.functional as F from torchvision import transforms as T from typing import Optional, Union, List diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py index cd26eecf426432..857c502aff63d0 100644 --- a/src/transformers/models/vit/modeling_vit.py +++ b/src/transformers/models/vit/modeling_vit.py @@ -56,7 +56,7 @@ _TOKENIZER_FOR_DOC = "ViTTokenizer" VIT_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "vit-base-patch16-224", + "nielsr/vit-base-patch16-224", # See all ViT models at https://huggingface.co/models?filter=vit ] diff --git a/tests/test_image_processor_common.py b/tests/test_image_processor_common.py index 
f5f694176f932e..c034c85f34abd0 100644 --- a/tests/test_image_processor_common.py +++ b/tests/test_image_processor_common.py @@ -69,8 +69,8 @@ def test_init_without_params(self): image_processor = self.image_processor_class() self.assertIsNotNone(image_processor) - def test_batch_images(self): - image_inputs = self.image_processor_tester.prepare_inputs_for_common() + def test_batch_images_numpy(self): + image_inputs = self.image_processor_tester.prepare_inputs_numpy_for_common() image_processor = self.image_processor_class(**self.image_processor_dict) input_name = image_processor.model_input_names[0] @@ -78,7 +78,7 @@ def test_batch_images(self): self.assertTrue(all(len(x) == len(y) for x, y in zip(image_inputs, processed_images[input_name]))) - image_inputs = self.image_processor_tester.prepare_inputs_for_common(equal_length=True) + image_inputs = self.image_processor_tester.prepare_inputs_numpy_for_common(equal_resolution=True) processed_images = BatchImages({input_name: image_inputs}, tensor_type="np") batch_images_input = processed_images[input_name] @@ -93,7 +93,7 @@ def test_batch_images(self): @require_torch def test_batch_images_pt(self): - image_inputs = self.image_processor_tester.prepare_inputs_for_common(equal_length=True) + image_inputs = self.image_processor_tester.prepare_inputs_pytorch_for_common(equal_length=True) image_processor = self.image_processor_class(**self.image_processor_dict) input_name = image_processor.model_input_names[0] @@ -111,21 +111,7 @@ def test_batch_images_pt(self): @require_tf def test_batch_images_tf(self): - image_inputs = self.image_processor_tester.prepare_inputs_for_common(equal_length=True) - image_processor = self.image_processor_class(**self.image_processor_dict) - input_name = image_processor.model_input_names[0] - - processed_images = BatchImages({input_name: image_inputs}, tensor_type="tf") - - batch_images_input = processed_images[input_name] - - if len(batch_images_input.shape) < 3: - batch_images_input = batch_images_input[:, :, None] - - # self.assertTrue( - # batch_images_input.shape - # == (self.image_processor_tester.batch_size, len(image_inputs[0]), self.image_processor_tester.feature_size) - # ) + pass def _check_padding(self, numpify=False): pass @@ -148,7 +134,7 @@ def test_attention_mask(self): feat_dict = self.image_processor_dict feat_dict["return_attention_mask"] = True image_processor = self.image_processor_class(**feat_dict) - image_inputs = self.image_processor_tester.prepare_inputs_for_common() + image_inputs = self.image_processor_tester.prepare_inputs_pytorch_for_common() input_lenghts = [len(x) for x in image_inputs] input_name = image_processor.model_input_names[0] diff --git a/tests/test_image_processor_vit.py b/tests/test_image_processor_vit.py index 5795a28cdbee10..8919b422c9ecc3 100644 --- a/tests/test_image_processor_vit.py +++ b/tests/test_image_processor_vit.py @@ -25,23 +25,7 @@ from transformers.testing_utils import slow from .test_image_processor_common import ImageProcessorMixin - - -global_rng = random.Random() - - -def floats_list(shape, scale=1.0, rng=None, name=None): - """Creates a random float32 tensor""" - if rng is None: - rng = global_rng - - values = [] - for batch_idx in range(shape[0]): - values.append([]) - for _ in range(shape[1]): - values[-1].append(rng.random() * scale) - - return values +from .test_modeling_common import floats_tensor class ViTImageProcessorTester(unittest.TestCase): @@ -86,7 +70,13 @@ def prepare_image_processor_dict(self): "size": self.size, } - def 
prepare_inputs_for_common(self): + def prepare_inputs_numpy_for_common(self, equal_resolution=False): + input_size = (self.num_channels, self.image_size, self.image_size) + image_inputs = torch.randn((self.batch_size, *input_size)) + + return image_inputs + + def prepare_inputs_pytorch_for_common(self, equal_resolution=False): input_size = (self.num_channels, self.image_size, self.image_size) image_inputs = torch.randn((self.batch_size, *input_size)) @@ -122,7 +112,7 @@ def test_call_pytorch(self): # Initialize image_processor image_processor = self.image_processor_class(**self.image_processor_tester.prepare_image_processor_dict()) # create three inputs of resolution 800, 1000, and 1200 - image_inputs = None + image_inputs = floats_tensor() # Test not batched input encoded_images_1 = image_processor(image_inputs[0], return_tensors="pt").input_values diff --git a/tests/test_modeling_vit.py b/tests/test_modeling_vit.py index 8216318be2447f..a4d26693813547 100644 --- a/tests/test_modeling_vit.py +++ b/tests/test_modeling_vit.py @@ -17,12 +17,15 @@ import unittest -from tests.test_modeling_common import floats_tensor +from PIL import Image +import requests +import torchvision.transforms as T + from transformers import is_torch_available from transformers.testing_utils import require_torch, slow, torch_device from .test_configuration_common import ConfigTester -from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask +from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask, floats_tensor if is_torch_available(): @@ -30,12 +33,7 @@ from transformers import ( ViTConfig, - ViTForCausalLM, - ViTForMaskedLM, - ViTForMultipleChoice, - ViTForQuestionAnswering, - ViTForSequenceClassification, - ViTForTokenClassification, + ViTForImageClassification, ViTModel, ) from transformers.models.vit.modeling_vit import ( @@ -48,12 +46,12 @@ def __init__( self, parent, batch_size=13, - seq_length=7, + image_size=30, + patch_size=2, + num_channels=3, is_training=True, use_input_mask=True, - use_token_type_ids=True, use_labels=True, - vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, @@ -61,22 +59,19 @@ def __init__( hidden_act="gelu", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, + type_sequence_label_size=10, initializer_range=0.02, num_labels=3, - num_choices=4, scope=None, ): self.parent = parent self.batch_size = batch_size - self.seq_length = seq_length + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels self.is_training = is_training self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids self.use_labels = use_labels - self.vocab_size = vocab_size self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads @@ -84,35 +79,26 @@ def __init__( self.hidden_act = hidden_act self.hidden_dropout_prob = hidden_dropout_prob self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size self.type_sequence_label_size = type_sequence_label_size self.initializer_range = initializer_range self.num_labels = num_labels - self.num_choices = num_choices self.scope = scope def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + pixel_values = 
floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) input_mask = None if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + input_mask = random_attention_mask([self.batch_size, self.num_channels, self.image_size, self.image_size]) - sequence_labels = None - token_labels = None - choice_labels = None + image_labels = None if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) + image_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) config = ViTConfig( - vocab_size=self.vocab_size, + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, hidden_size=self.hidden_size, num_hidden_layers=self.num_hidden_layers, num_attention_heads=self.num_attention_heads, @@ -120,239 +106,42 @@ def prepare_config_and_inputs(self): hidden_act=self.hidden_act, hidden_dropout_prob=self.hidden_dropout_prob, attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, is_decoder=False, initializer_range=self.initializer_range, ) - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = self.prepare_config_and_inputs() - - config.is_decoder = True - encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - return ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) + return config, pixel_values, input_mask, image_labels def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + self, config, pixel_values, input_mask, image_labels ): model = ViTModel(config=config) model.to(torch_device) model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = ViTModel(config) - model.to(torch_device) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(pixel_values, 
attention_mask=input_mask) + result = model(pixel_values) + result = model(pixel_values) self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = ViTForCausalLM(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_for_masked_lm( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = ViTForMaskedLM(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = ViTForCausalLM(config=config) - model.to(torch_device) - model.eval() - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() - output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_for_question_answering( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = ViTForQuestionAnswering(config=config) - model.to(torch_device) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - 
self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def create_and_check_for_sequence_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + def create_and_check_for_image_classification( + self, config, pixel_values, input_mask, image_labels ): config.num_labels = self.num_labels - model = ViTForSequenceClassification(config) + model = ViTForImageClassification(config) model.to(torch_device) model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + result = model(pixel_values, attention_mask=input_mask, labels=image_labels) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - def create_and_check_for_token_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = ViTForTokenClassification(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_for_multiple_choice( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_choices = self.num_choices - model = ViTForMultipleChoice(config=config) - model.to(torch_device) - model.eval() - multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - result = model( - multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, - token_type_ids=multiple_choice_token_type_ids, - labels=choice_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) - def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() ( config, - input_ids, - token_type_ids, + pixel_values, input_mask, - sequence_labels, - token_labels, - choice_labels, + image_labels, ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + inputs_dict = {"pixel_values": pixel_values, "attention_mask": input_mask} return config, inputs_dict @@ -362,17 +151,16 @@ class ViTModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = ( ( ViTModel, - ViTForMaskedLM, - ViTForCausalLM, - ViTForMultipleChoice, - ViTForQuestionAnswering, - ViTForSequenceClassification, - ViTForTokenClassification, + ViTForImageClassification, ) if is_torch_available() else () ) - all_generative_model_classes = (ViTForCausalLM,) if is_torch_available() else () + + test_pruning = False + test_torchscript = False + test_resize_embeddings = False + test_head_masking = False def setUp(self): self.model_tester = ViTModelTester(self) @@ -385,67 +173,9 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in 
["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) - - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - def test_for_sequence_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) - - def test_for_token_classification(self): + def test_for_image_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - def test_model_as_decoder(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) - - def test_model_as_decoder_with_default_input_mask(self): - # This regression test was failing with PyTorch < 1.3 - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) = self.model_tester.prepare_config_and_inputs_for_decoder() - - input_mask = None - - self.model_tester.create_and_check_model_as_decoder( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) @slow def test_model_from_pretrained(self): @@ -454,25 +184,44 @@ def test_model_from_pretrained(self): self.assertIsNotNone(model) +# We will verify our results on an image of cute cats +# TODO: use VitImageProcessor in the future +def prepare_img(image_resolution): + url = 'http://images.cocodataset.org/val2017/000000039769.jpg' + im = Image.open(requests.get(url, stream=True).raw) + + # standard PyTorch mean-std input image normalization + transform = T.Compose([ + T.Resize((image_resolution,image_resolution)), + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + # mean-std normalize the input image (batch-size: 1) + img = transform(im).unsqueeze(0) + + return img + + @require_torch class ViTModelIntegrationTest(unittest.TestCase): @slow - def test_inference_masked_lm(self): - model = ViTForMaskedLM.from_pretrained("google/vit_small_patch16_224") - input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]]) - output = model(input_ids)[0] + def test_inference_image_classification_head(self): + # TODO: replace namespace to google + model = ViTForImageClassification.from_pretrained("nielsr/vit-base-patch16-224").to(torch_device) + pixel_values = prepare_img(224).to(torch_device) - # TODO Replace vocab size - vocab_size = 32000 + # forward pass + outputs = model(pixel_values) - 
expected_shape = torch.Size((1, 6, vocab_size)) - self.assertEqual(output.shape, expected_shape) + # verify the logits + expected_shape = torch.Size((1, 1000)) + self.assertEqual(outputs.logits.shape, expected_shape) - # TODO Replace values below with what was printed above. expected_slice = torch.tensor( - [[[-0.0483, 0.1188, -0.0313], [-0.0606, 0.1435, 0.0199], [-0.0235, 0.1519, 0.0175]]] - ) + [-0.7332, 0.7286, -0.4020] + ).to(torch_device) - self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) + self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) From dca36be0518213efb8735b18cc339119cb6dabce Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 4 Mar 2021 17:50:42 +0100 Subject: [PATCH 12/44] Small cleanup --- src/transformers/models/vit/__init__.py | 4 +--- src/transformers/models/vit/modeling_vit.py | 12 ++++++------ tests/test_image_processor_common.py | 1 + tests/test_modeling_vit.py | 5 +---- 4 files changed, 9 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/vit/__init__.py b/src/transformers/models/vit/__init__.py index 3aaf327aebc929..69e397120ff411 100644 --- a/src/transformers/models/vit/__init__.py +++ b/src/transformers/models/vit/__init__.py @@ -2,7 +2,7 @@ # There's no way to ignore "F401 '...' imported but unused" warnings in this # module, but to preserve other warnings. So, don't check this module at all. -# Copyright 2020 The HuggingFace Team. All rights reserved. +# Copyright 2021 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -29,7 +29,6 @@ "ViTLayer", "ViTModel", "ViTPreTrainedModel", - "load_tf_weights_in_vit", ] @@ -44,7 +43,6 @@ ViTLayer, ViTModel, ViTPreTrainedModel, - load_tf_weights_in_vit, ) diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py index 857c502aff63d0..1d87f23c3a4f8e 100644 --- a/src/transformers/models/vit/modeling_vit.py +++ b/src/transformers/models/vit/modeling_vit.py @@ -32,12 +32,7 @@ ) from ...modeling_outputs import ( BaseModelOutputWithPastAndCrossAttentions, - CausalLMOutputWithCrossAttentions, - MaskedLMOutput, - MultipleChoiceModelOutput, - QuestionAnsweringModelOutput, SequenceClassifierOutput, - TokenClassifierOutput, ) from ...modeling_utils import ( PreTrainedModel, @@ -83,7 +78,12 @@ def parse(x): class ViTEmbeddings(nn.Module): - """Construct the cls token, position and patch embeddings.""" + """Construct the cls token, position and patch embeddings. 
+ + Based on timm implementation, which can be found here: + https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py + + """ def __init__(self, config): super().__init__() diff --git a/tests/test_image_processor_common.py b/tests/test_image_processor_common.py index c034c85f34abd0..397c9092496202 100644 --- a/tests/test_image_processor_common.py +++ b/tests/test_image_processor_common.py @@ -70,6 +70,7 @@ def test_init_without_params(self): self.assertIsNotNone(image_processor) def test_batch_images_numpy(self): + # Define the images + initialize image_processor image_inputs = self.image_processor_tester.prepare_inputs_numpy_for_common() image_processor = self.image_processor_class(**self.image_processor_dict) input_name = image_processor.model_input_names[0] diff --git a/tests/test_modeling_vit.py b/tests/test_modeling_vit.py index a4d26693813547..a3e6d917c649e9 100644 --- a/tests/test_modeling_vit.py +++ b/tests/test_modeling_vit.py @@ -120,7 +120,6 @@ def create_and_check_model( model.eval() result = model(pixel_values, attention_mask=input_mask) result = model(pixel_values) - result = model(pixel_values) self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) def create_and_check_for_image_classification( @@ -222,6 +221,4 @@ def test_inference_image_classification_head(self): [-0.7332, 0.7286, -0.4020] ).to(torch_device) - self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) - - + self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) \ No newline at end of file From 9f403520a4f1b96020a04936e73e84899eb5c11b Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 4 Mar 2021 18:20:09 +0100 Subject: [PATCH 13/44] Remove is_decoder logic and make style --- src/transformers/__init__.py | 19 +- src/transformers/image_processor_utils.py | 19 +- src/transformers/models/__init__.py | 2 +- .../models/auto/configuration_auto.py | 2 +- src/transformers/models/auto/modeling_auto.py | 8 +- src/transformers/models/vit/__init__.py | 5 +- .../models/vit/configuration_vit.py | 9 +- .../models/vit/convert_vit_timm_to_pytorch.py | 155 +++++--- .../models/vit/image_processor_vit.py | 8 +- src/transformers/models/vit/modeling_vit.py | 368 +++--------------- tests/test_image_processor_common.py | 2 +- tests/test_image_processor_vit.py | 6 +- tests/test_modeling_vit.py | 84 ++-- 13 files changed, 219 insertions(+), 468 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index f132dff66198bc..907d5cbb31c4c3 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1343,7 +1343,6 @@ load_tf2_weights_in_pytorch_model, ) from .models.albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig - from .models.vit import VIT_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTConfig, ViTImageProcessor from .models.auto import ( ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, CONFIG_MAPPING, @@ -1424,6 +1423,7 @@ TransfoXLCorpus, TransfoXLTokenizer, ) + from .models.vit import VIT_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTConfig, ViTImageProcessor from .models.wav2vec2 import ( WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP, Wav2Vec2Config, @@ -1547,15 +1547,6 @@ # Modeling if is_torch_available(): - from .models.vit import ( - VIT_PRETRAINED_MODEL_ARCHIVE_LIST, - ViTForImageClassification, - ViTLayer, - ViTModel, - ViTPreTrainedModel, - load_tf_weights_in_vit, - ) - # Benchmarks from .benchmark.benchmark import PyTorchBenchmark from 
.benchmark.benchmark_args import PyTorchBenchmarkArguments @@ -1950,6 +1941,14 @@ TransfoXLPreTrainedModel, load_tf_weights_in_transfo_xl, ) + from .models.vit import ( + VIT_PRETRAINED_MODEL_ARCHIVE_LIST, + ViTForImageClassification, + ViTLayer, + ViTModel, + ViTPreTrainedModel, + load_tf_weights_in_vit, + ) from .models.wav2vec2 import ( WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST, Wav2Vec2ForCTC, diff --git a/src/transformers/image_processor_utils.py b/src/transformers/image_processor_utils.py index 074e2c71d73040..170adb2e811e91 100644 --- a/src/transformers/image_processor_utils.py +++ b/src/transformers/image_processor_utils.py @@ -15,8 +15,8 @@ """ Image processor common class for python image processors. - Based on https://github.com/huggingface/transformers/blob/master/src/transformers/feature_extraction_utils.py, - but PreTrainedFeatureExtractor -> PreTrainedImageProcessor, BatchFeature -> BatchImages, and so on. + Based on https://github.com/huggingface/transformers/blob/master/src/transformers/feature_extraction_utils.py, but + PreTrainedFeatureExtractor -> PreTrainedImageProcessor, BatchFeature -> BatchImages, and so on. """ import copy import json @@ -58,7 +58,8 @@ class BatchImages(UserDict): r""" - Holds the output of the :meth:`~transformers.PreTrainedImageProcessor.pad` and image processor specific ``__call__`` methods. + Holds the output of the :meth:`~transformers.PreTrainedImageProcessor.pad` and image processor specific + ``__call__`` methods. This class is derived from a python dictionary and can be used as a dictionary. @@ -78,8 +79,8 @@ def __init__(self, data: Optional[Dict[str, Any]] = None, tensor_type: Union[Non def __getitem__(self, item: str) -> Union[Any]: """ - If the key is a string, returns the value of the dict associated to :obj:`key` ('pixel_values', - 'pixel_mask', etc.). + If the key is a string, returns the value of the dict associated to :obj:`key` ('pixel_values', 'pixel_mask', + etc.). """ if isinstance(item, str): return self.data[item] @@ -559,8 +560,8 @@ def pad( This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. return_pixel_mask (:obj:`bool`, `optional`): - Whether to return the pixel mask. If left to the default, will return the pixel mask according - to the specific image_processor's default. + Whether to return the pixel mask. If left to the default, will return the pixel mask according to the + specific image_processor's default. `What are pixel masks? <../glossary.html#attention-mask>`__ return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`): @@ -586,9 +587,7 @@ def pad( ) required_input = processed_images[self.model_input_names[0]] - return_pixel_mask = ( - return_pixel_mask if return_pixel_mask is not None else self.return_pixel_mask - ) + return_pixel_mask = return_pixel_mask if return_pixel_mask is not None else self.return_pixel_mask if not required_input: if return_pixel_mask: diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 9dd8101928f16e..16f6b4e2b88d14 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -17,7 +17,6 @@ # limitations under the License. from . 
import ( - vit, albert, auto, bart, @@ -66,6 +65,7 @@ t5, tapas, transfo_xl, + vit, wav2vec2, xlm, xlm_roberta, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 2a266f32c9f6dd..ab6541edfca9af 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -19,7 +19,6 @@ from ...configuration_utils import PretrainedConfig from ..albert.configuration_albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig -from ..vit.configuration_vit import VIT_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTConfig from ..bart.configuration_bart import BART_PRETRAINED_CONFIG_ARCHIVE_MAP, BartConfig from ..bert.configuration_bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BertConfig from ..bert_generation.configuration_bert_generation import BertGenerationConfig @@ -67,6 +66,7 @@ from ..t5.configuration_t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config from ..tapas.configuration_tapas import TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP, TapasConfig from ..transfo_xl.configuration_transfo_xl import TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, TransfoXLConfig +from ..vit.configuration_vit import VIT_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTConfig from ..wav2vec2.configuration_wav2vec2 import WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP, Wav2Vec2Config from ..xlm.configuration_xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMConfig from ..xlm_prophetnet.configuration_xlm_prophetnet import ( diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index d2a264610d4412..d0dca675ae7e86 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -21,12 +21,6 @@ from ...configuration_utils import PretrainedConfig from ...file_utils import add_start_docstrings from ...utils import logging - -# Add modeling imports here -from ..vit.modeling_vit import ( - ViTForImageClassification, - ViTModel, -) from ..albert.modeling_albert import ( AlbertForMaskedLM, AlbertForMultipleChoice, @@ -266,7 +260,6 @@ XLNetModel, ) from .configuration_auto import ( - ViTConfig, AlbertConfig, AutoConfig, BartConfig, @@ -309,6 +302,7 @@ T5Config, TapasConfig, TransfoXLConfig, + ViTConfig, Wav2Vec2Config, XLMConfig, XLMProphetNetConfig, diff --git a/src/transformers/models/vit/__init__.py b/src/transformers/models/vit/__init__.py index 69e397120ff411..69f30c7ae8a124 100644 --- a/src/transformers/models/vit/__init__.py +++ b/src/transformers/models/vit/__init__.py @@ -16,7 +16,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
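Note on the auto-model registration touched in the modeling_auto.py hunk above: MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING is an ordered dict keyed by config class, which lets an Auto-style factory pick the right head from a config instance. The following is a hypothetical, simplified sketch of that lookup for illustration only; it is not the actual AutoModel code path, and the mapping name used here is invented.

from collections import OrderedDict

from transformers import ViTConfig, ViTForImageClassification

# Illustrative mapping, mirroring the single entry added by this patch series.
IMAGE_CLASSIFICATION_MAPPING = OrderedDict(
    [
        (ViTConfig, ViTForImageClassification),
    ]
)


def model_class_for_config(config):
    # Return the first model class whose registered config class matches the given config.
    for config_class, model_class in IMAGE_CLASSIFICATION_MAPPING.items():
        if isinstance(config, config_class):
            return model_class
    raise ValueError(f"Unrecognized configuration class {config.__class__}")


config = ViTConfig()
model = model_class_for_config(config)(config)  # instantiates a randomly initialized ViTForImageClassification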
from typing import TYPE_CHECKING -from ...file_utils import _BaseLazyModule, is_torch_available, is_tokenizers_available + +from ...file_utils import _BaseLazyModule, is_tokenizers_available, is_torch_available + + _import_structure = { "configuration_vit": ["VIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTConfig"], "image_processor_vit": ["ViTImageProcessor"], diff --git a/src/transformers/models/vit/configuration_vit.py b/src/transformers/models/vit/configuration_vit.py index a787baa084f931..85bc8997f50d98 100644 --- a/src/transformers/models/vit/configuration_vit.py +++ b/src/transformers/models/vit/configuration_vit.py @@ -96,6 +96,7 @@ class ViTConfig(PretrainedConfig): >>> configuration = model.config """ model_type = "vit" + def __init__( self, vocab_size=30522, @@ -120,12 +121,7 @@ def __init__( num_channels=3, **kwargs ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - **kwargs - ) + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings @@ -144,4 +140,3 @@ def __init__( self.image_size = image_size self.patch_size = patch_size self.num_channels = num_channels - \ No newline at end of file diff --git a/src/transformers/models/vit/convert_vit_timm_to_pytorch.py b/src/transformers/models/vit/convert_vit_timm_to_pytorch.py index 37443c9765ba89..04be8fd53fcd80 100644 --- a/src/transformers/models/vit/convert_vit_timm_to_pytorch.py +++ b/src/transformers/models/vit/convert_vit_timm_to_pytorch.py @@ -20,23 +20,17 @@ from pathlib import Path import torch +import torchvision.transforms as T from packaging import version +from PIL import Image from torch import nn -import timm -from PIL import Image import requests -import torchvision.transforms as T - -from transformers import ( - ViTConfig, - ViTModel, - ViTForImageClassification, -) +import timm +from transformers import ViTConfig, ViTForImageClassification, ViTModel from transformers.utils import logging - logging.set_verbosity_info() logger = logging.get_logger(__name__) @@ -44,32 +38,61 @@ # here we list all keys to be renamed (original name on the left, our name on the right) def create_rename_keys(config, base_model=False): rename_keys = [] - for i in range(config.num_hidden_layers): + for i in range(config.num_hidden_layers): # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append(("blocks." + str(i) + ".norm1.weight", "vit.encoder.layer." + str(i) + ".layernorm_before.weight")) - rename_keys.append(("blocks." + str(i) + ".norm1.bias", "vit.encoder.layer." + str(i) + ".layernorm_before.bias")) - rename_keys.append(("blocks." + str(i) + ".attn.proj.weight", "vit.encoder.layer." + str(i) + ".attention.output.dense.weight")) - rename_keys.append(("blocks." + str(i) + ".attn.proj.bias", "vit.encoder.layer." + str(i) + ".attention.output.dense.bias")) - rename_keys.append(("blocks." + str(i) + ".norm2.weight", "vit.encoder.layer." + str(i) + ".layernorm_after.weight")) - rename_keys.append(("blocks." + str(i) + ".norm2.bias", "vit.encoder.layer." + str(i) + ".layernorm_after.bias")) - rename_keys.append(("blocks." + str(i) + ".mlp.fc1.weight", "vit.encoder.layer." + str(i) + ".intermediate.dense.weight")) - rename_keys.append(("blocks." + str(i) + ".mlp.fc1.bias", "vit.encoder.layer." + str(i) + ".intermediate.dense.bias")) - rename_keys.append(("blocks." 
+ str(i) + ".mlp.fc2.weight", "vit.encoder.layer." + str(i) + ".output.dense.weight")) - rename_keys.append(("blocks." + str(i) + ".mlp.fc2.bias", "vit.encoder.layer." + str(i) + ".output.dense.bias")) - - # projection layer + position embeddings - rename_keys.extend([("cls_token", "vit.embeddings.cls_token"), - ("patch_embed.proj.weight", "vit.embeddings.patch_embeddings.projection.weight"), - ("patch_embed.proj.bias", "vit.embeddings.patch_embeddings.projection.bias"), - ("pos_embed", "vit.embeddings.position_embeddings"), - ]) + rename_keys.append( + ("blocks." + str(i) + ".norm1.weight", "vit.encoder.layer." + str(i) + ".layernorm_before.weight") + ) + rename_keys.append( + ("blocks." + str(i) + ".norm1.bias", "vit.encoder.layer." + str(i) + ".layernorm_before.bias") + ) + rename_keys.append( + ( + "blocks." + str(i) + ".attn.proj.weight", + "vit.encoder.layer." + str(i) + ".attention.output.dense.weight", + ) + ) + rename_keys.append( + ("blocks." + str(i) + ".attn.proj.bias", "vit.encoder.layer." + str(i) + ".attention.output.dense.bias") + ) + rename_keys.append( + ("blocks." + str(i) + ".norm2.weight", "vit.encoder.layer." + str(i) + ".layernorm_after.weight") + ) + rename_keys.append( + ("blocks." + str(i) + ".norm2.bias", "vit.encoder.layer." + str(i) + ".layernorm_after.bias") + ) + rename_keys.append( + ("blocks." + str(i) + ".mlp.fc1.weight", "vit.encoder.layer." + str(i) + ".intermediate.dense.weight") + ) + rename_keys.append( + ("blocks." + str(i) + ".mlp.fc1.bias", "vit.encoder.layer." + str(i) + ".intermediate.dense.bias") + ) + rename_keys.append( + ("blocks." + str(i) + ".mlp.fc2.weight", "vit.encoder.layer." + str(i) + ".output.dense.weight") + ) + rename_keys.append( + ("blocks." + str(i) + ".mlp.fc2.bias", "vit.encoder.layer." + str(i) + ".output.dense.bias") + ) + + # projection layer + position embeddings + rename_keys.extend( + [ + ("cls_token", "vit.embeddings.cls_token"), + ("patch_embed.proj.weight", "vit.embeddings.patch_embeddings.projection.weight"), + ("patch_embed.proj.bias", "vit.embeddings.patch_embeddings.projection.bias"), + ("pos_embed", "vit.embeddings.position_embeddings"), + ] + ) # classification head - rename_keys.extend([("head.weight", "classifier.weight"), - ("head.bias", "classifier.bias"), - ("norm.weight", "layernorm.weight"), - ("norm.bias", "layernorm.bias"), - ]) + rename_keys.extend( + [ + ("head.weight", "classifier.weight"), + ("head.bias", "classifier.bias"), + ("norm.weight", "layernorm.weight"), + ("norm.bias", "layernorm.bias"), + ] + ) # to do: add base model support # if just the base model, we should remove "vit" from all keys @@ -86,13 +109,21 @@ def read_in_q_k_v(state_dict, config, base_model=False): in_proj_weight = state_dict.pop("blocks." + str(i) + ".attn.qkv.weight") in_proj_bias = state_dict.pop("blocks." + str(i) + ".attn.qkv.bias") # next, add query, keys and values (in that order) to the state dict - state_dict["vit.encoder.layer." + str(i) + ".attention.self.query.weight"] = in_proj_weight[:config.hidden_size, :] - state_dict["vit.encoder.layer." + str(i) + ".attention.self.query.bias"] = in_proj_bias[:config.hidden_size] - state_dict["vit.encoder.layer." + str(i) + ".attention.self.key.weight"] = in_proj_weight[config.hidden_size:config.hidden_size*2, :] - state_dict["vit.encoder.layer." + str(i) + ".attention.self.key.bias"] = in_proj_bias[config.hidden_size:config.hidden_size*2] - state_dict["vit.encoder.layer." 
+ str(i) + ".attention.self.value.weight"] = in_proj_weight[-config.hidden_size:, :] - state_dict["vit.encoder.layer." + str(i) + ".attention.self.value.bias"] = in_proj_bias[-config.hidden_size:] - + state_dict["vit.encoder.layer." + str(i) + ".attention.self.query.weight"] = in_proj_weight[ + : config.hidden_size, : + ] + state_dict["vit.encoder.layer." + str(i) + ".attention.self.query.bias"] = in_proj_bias[: config.hidden_size] + state_dict["vit.encoder.layer." + str(i) + ".attention.self.key.weight"] = in_proj_weight[ + config.hidden_size : config.hidden_size * 2, : + ] + state_dict["vit.encoder.layer." + str(i) + ".attention.self.key.bias"] = in_proj_bias[ + config.hidden_size : config.hidden_size * 2 + ] + state_dict["vit.encoder.layer." + str(i) + ".attention.self.value.weight"] = in_proj_weight[ + -config.hidden_size :, : + ] + state_dict["vit.encoder.layer." + str(i) + ".attention.self.value.bias"] = in_proj_bias[-config.hidden_size :] + # to do: add base model support if base_model: pass @@ -116,15 +147,13 @@ def rename_key(dct, old, new): # We will verify our results on an image of cute cats def prepare_img(image_resolution): - url = 'http://images.cocodataset.org/val2017/000000039769.jpg' + url = "http://images.cocodataset.org/val2017/000000039769.jpg" im = Image.open(requests.get(url, stream=True).raw) # standard PyTorch mean-std input image normalization - transform = T.Compose([ - T.Resize((image_resolution,image_resolution)), - T.ToTensor(), - T.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]) - ]) + transform = T.Compose( + [T.Resize((image_resolution, image_resolution)), T.ToTensor(), T.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])] + ) # mean-std normalize the input image (batch-size: 1) img = transform(im).unsqueeze(0) @@ -137,26 +166,26 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path, base_model=False) """ Copy/paste/tweak model's weights to our ViT structure. 
""" - + # define HuggingFace configuration config = ViTConfig() if vit_name == "vit_base_patch16_224": - config.num_labels=1000 + config.num_labels = 1000 elif vit_name == "vit_base_patch32_224": config.patch_size = 32 - config.num_labels=1000 + config.num_labels = 1000 elif vit_name == "vit_base_patch16_384": config.image_size = 384 - config.num_labels=1000 + config.num_labels = 1000 elif vit_name == "vit_base_patch32_384": config.image_size = 384 config.patch_size = 32 - config.num_labels=1000 + config.num_labels = 1000 # load original model from timm vit = timm.create_model(vit_name, pretrained=True) vit.eval() - + # load state_dict of original model, remove and rename some keys state_dict = vit.state_dict() rename_keys = create_rename_keys(config, base_model) @@ -165,7 +194,7 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path, base_model=False) read_in_q_k_v(state_dict, config, base_model) if base_model: remove_classification_head_(state_dict) - + model = ViTForImageClassification(config).eval() model.load_state_dict(state_dict) @@ -176,7 +205,7 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path, base_model=False) assert logits.shape == outputs.logits.shape assert torch.allclose(logits, outputs.logits, atol=1e-4) - + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) print(f"Saving model to {pytorch_dump_folder_path}") model.save_pretrained(pytorch_dump_folder_path) @@ -186,9 +215,19 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path, base_model=False) parser = argparse.ArgumentParser() # Required parameters parser.add_argument( - "--vit_name", default='vit_base_patch16_224', type=str, help="Name of the ViT timm model you'd like to convert, currently supports ViT base models." + "--vit_name", + default="vit_base_patch16_224", + type=str, + help="Name of the ViT timm model you'd like to convert, currently supports ViT base models.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." + ) + parser.add_argument( + "--base_model", + default=False, + action="store_true", + help="Whether to just load the base model without any head.", ) - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory.") - parser.add_argument("--base_model", default=False, action="store_true", help="Whether to just load the base model without any head.") args = parser.parse_args() - convert_vit_checkpoint(args.vit_name, args.pytorch_dump_folder_path) \ No newline at end of file + convert_vit_checkpoint(args.vit_name, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/vit/image_processor_vit.py b/src/transformers/models/vit/image_processor_vit.py index 30f94e4eb6fe8e..102dc7d302afc1 100644 --- a/src/transformers/models/vit/image_processor_vit.py +++ b/src/transformers/models/vit/image_processor_vit.py @@ -14,14 +14,14 @@ # limitations under the License. 
"""Image processor class for ViT.""" +from typing import List, Optional, Union + import numpy as np import PIL import torch import torchvision from torchvision import transforms as T -from typing import Optional, Union, List - from ...file_utils import PaddingStrategy, TensorType from ...image_processor_utils import BatchImages, PreTrainedImageProcessor from ...utils import logging @@ -237,7 +237,7 @@ def __call__( # step 3: apply transformations to images transformed_images = [transforms(image) for image in images] - # step 4: TO DO: replace by self.pad (which is defined in image_processor_utils.py), which should + # step 4: TO DO: replace by self.pad (which is defined in image_processor_utils.py), which should # take care of padding, creation of attention mask, return_tensors type samples = nested_tensor_from_tensor_list(transformed_images) @@ -246,4 +246,4 @@ def __call__( encoded_inputs = BatchImages(data=data) - return encoded_inputs \ No newline at end of file + return encoded_inputs diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py index 1d87f23c3a4f8e..1eafcc8daf9523 100644 --- a/src/transformers/models/vit/modeling_vit.py +++ b/src/transformers/models/vit/modeling_vit.py @@ -30,10 +30,7 @@ add_start_docstrings_to_model_forward, replace_return_docstrings, ) -from ...modeling_outputs import ( - BaseModelOutputWithPastAndCrossAttentions, - SequenceClassifierOutput, -) +from ...modeling_outputs import BaseModelOutput, SequenceClassifierOutput from ...modeling_utils import ( PreTrainedModel, SequenceSummary, @@ -59,11 +56,11 @@ """ Layer/Module Helpers Hacked together by / Copyright 2020 Ross Wightman """ -from itertools import repeat import collections.abc +from itertools import repeat -# Copied from +# Copied from # https://github.com/rwightman/pytorch-image-models/blob/b9bd960a032c75ca6b808ddeed76bee5f3ed4972/timm/models/layers/helpers.py # From PyTorch internals def _ntuple(n): @@ -71,6 +68,7 @@ def parse(x): if isinstance(x, collections.abc.Iterable): return x return tuple(repeat(x, n)) + return parse @@ -79,10 +77,10 @@ def parse(x): class ViTEmbeddings(nn.Module): """Construct the cls token, position and patch embeddings. 
- - Based on timm implementation, which can be found here: + + Based on timm implementation, which can be found here: https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py - + """ def __init__(self, config): @@ -90,8 +88,11 @@ def __init__(self, config): self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) self.patch_embeddings = PatchEmbeddings( - image_size=config.image_size, patch_size=config.patch_size, num_channels=config.num_channels, - embed_dim=config.hidden_size) + image_size=config.image_size, + patch_size=config.patch_size, + num_channels=config.num_channels, + embed_dim=config.hidden_size, + ) num_patches = self.patch_embeddings.num_patches self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size)) self.dropout = nn.Dropout(config.hidden_dropout_prob) @@ -109,6 +110,7 @@ def forward(self, pixel_values): class PatchEmbeddings(nn.Module): """ Image to Patch Embedding.""" + def __init__(self, image_size=224, patch_size=16, num_channels=3, embed_dim=768): super().__init__() image_size = to_2tuple(image_size) @@ -123,8 +125,9 @@ def __init__(self, image_size=224, patch_size=16, num_channels=3, embed_dim=768) def forward(self, x): B, C, H, W = x.shape # FIXME look at relaxing size constraints - assert H == self.image_size[0] and W == self.image_size[1], \ - f"Input image size ({H}*{W}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})." + assert ( + H == self.image_size[0] and W == self.image_size[1] + ), f"Input image size ({H}*{W}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})." x = self.projection(x).flatten(2).transpose(1, 2) return x @@ -147,12 +150,6 @@ def __init__(self, config): self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) - - self.is_decoder = config.is_decoder def transpose_for_scores(self, x): new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) @@ -164,72 +161,22 @@ def forward( hidden_states, attention_mask=None, head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, output_attentions=False, ): mixed_query_layer = self.query(hidden_states) - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. 
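To make the PatchEmbeddings module above concrete: a 224x224 image split into 16x16 patches yields 14 * 14 = 196 patch tokens, produced by a Conv2d whose kernel size and stride both equal the patch size, then flattened and transposed to (batch, num_patches, hidden_size). A standalone sketch of that shape flow, using the default dimensions from this patch (the tensors are random dummies):

import torch
from torch import nn

batch_size, num_channels, image_size, patch_size, hidden_size = 2, 3, 224, 16, 768

# Non-overlapping patches: kernel_size == stride == patch_size.
projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

pixel_values = torch.randn(batch_size, num_channels, image_size, image_size)
patches = projection(pixel_values)               # (2, 768, 14, 14)
embeddings = patches.flatten(2).transpose(1, 2)  # (2, 196, 768)

assert embeddings.shape == (batch_size, (image_size // patch_size) ** 2, hidden_size)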
- is_cross_attention = encoder_hidden_states is not None - - if is_cross_attention and past_key_value is not None: - # reuse k,v, cross_attentions - key_layer = past_key_value[0] - value_layer = past_key_value[1] - attention_mask = encoder_attention_mask - elif is_cross_attention: - key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) - value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) - attention_mask = encoder_attention_mask - elif past_key_value is not None: - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - key_layer = torch.cat([past_key_value[0], key_layer], dim=2) - value_layer = torch.cat([past_key_value[1], value_layer], dim=2) - else: - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) query_layer = self.transpose_for_scores(mixed_query_layer) - if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. - # Further calls to cross_attention layer can then reuse all cross-attention - # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention `past_key_value` is always `None` - past_key_value = (key_layer, value_layer) - # Take the dot product between "query" and "key" to get the raw attention scores. attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) - # if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - # seq_length = hidden_states.size()[1] - # position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) - # position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) - # distance = position_ids_l - position_ids_r - # positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) - # positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility - - # if self.position_embedding_type == "relative_key": - # relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - # attention_scores = attention_scores + relative_position_scores - # elif self.position_embedding_type == "relative_key_query": - # relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - # relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) - # attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in ViTModel forward() function) attention_scores = attention_scores + attention_mask - + # Normalize the attention scores to probabilities. 
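Once the decoder and cross-attention branches are stripped out, the ViTSelfAttention.forward above reduces to standard scaled dot-product attention. A minimal single-head sketch of the computation it performs (the head size and sequence length are illustrative; 197 corresponds to 196 patches plus the [CLS] token):

import math

import torch
from torch import nn

batch_size, seq_len, head_size = 2, 197, 64

query = torch.randn(batch_size, seq_len, head_size)
key = torch.randn(batch_size, seq_len, head_size)
value = torch.randn(batch_size, seq_len, head_size)

# Raw attention scores, scaled by the square root of the head dimension.
attention_scores = torch.matmul(query, key.transpose(-1, -2)) / math.sqrt(head_size)

# Normalize to probabilities, then take the weighted sum of the values.
attention_probs = nn.Softmax(dim=-1)(attention_scores)
context = torch.matmul(attention_probs, value)

assert context.shape == (batch_size, seq_len, head_size)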
attention_probs = nn.Softmax(dim=-1)(attention_scores) @@ -249,28 +196,23 @@ def forward( outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) - if self.is_decoder: - outputs = outputs + (past_key_value,) return outputs class ViTSelfOutput(nn.Module): - """ The residual connection is defined in VitLayer instead of here (as is the case with our models), - due to the layernorm applied before each block. """ - + """The residual connection is defined in VitLayer instead of here (as is the case with other models), + due to the layernorm applied before each block.""" + def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states, input_tensor): - + hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) - # first residual connection - #hidden_states = hidden_states + input_tensor - return hidden_states @@ -304,18 +246,12 @@ def forward( hidden_states, attention_mask=None, head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, output_attentions=False, ): self_outputs = self.self( hidden_states, attention_mask, head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, output_attentions, ) @@ -335,7 +271,7 @@ def __init__(self, config): self.intermediate_act_fn = config.hidden_act def forward(self, hidden_states): - + hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) @@ -359,17 +295,12 @@ def forward(self, hidden_states, input_tensor): class ViTLayer(nn.Module): """This corresponds to the Block class in the timm implementation.""" - + def __init__(self, config): super().__init__() self.chunk_size_feed_forward = config.chunk_size_feed_forward self.seq_len_dim = 1 self.attention = ViTAttention(config) - self.is_decoder = config.is_decoder - self.add_cross_attention = config.add_cross_attention - if self.add_cross_attention: - assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added" - self.crossattention = ViTAttention(config) self.intermediate = ViTIntermediate(config) self.output = ViTOutput(config) self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) @@ -380,60 +311,23 @@ def forward( hidden_states, attention_mask=None, head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, output_attentions=False, ): - # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 - self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None - self_attention_outputs = self.attention( - self.layernorm_before(hidden_states), # in ViT, layernorm is applied before self-attention + self.layernorm_before(hidden_states), # in ViT, layernorm is applied before self-attention attention_mask, head_mask, output_attentions=output_attentions, - past_key_value=self_attn_past_key_value, ) attention_output = self_attention_outputs[0] - - # if decoder, the last output is tuple of self-attn cache - if self.is_decoder: - outputs = self_attention_outputs[1:-1] - present_key_value = self_attention_outputs[-1] - else: - outputs = self_attention_outputs[1:] # add self attentions if we output attention weights - - cross_attn_present_key_value = None - if self.is_decoder and encoder_hidden_states is not None: - assert hasattr( - self, "crossattention" - ), f"If 
`encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`" - - # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple - cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None - cross_attention_outputs = self.crossattention( - attention_output, - attention_mask, - head_mask, - encoder_hidden_states, - encoder_attention_mask, - cross_attn_past_key_value, - output_attentions, - ) - attention_output = cross_attention_outputs[0] - outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights - - # add cross-attn cache to positions 3,4 of present_key_value tuple - cross_attn_present_key_value = cross_attention_outputs[-1] - present_key_value = present_key_value + cross_attn_present_key_value + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights # first residual connection hidden_states = attention_output + hidden_states - + # in ViT, layernorm is also applied after self-attention - layer_output = self.layernorm_after(hidden_states) - + layer_output = self.layernorm_after(hidden_states) + # feedforward chunking not working for now # layer_output = apply_chunking_to_forward( # self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, layer_output @@ -443,12 +337,8 @@ def forward( # second residual connection is done here layer_output = self.output(layer_output, hidden_states) - - outputs = (layer_output,) + outputs - # if decoder, return the attn key/values as the last output - if self.is_decoder: - outputs = outputs + (present_key_value,) + outputs = (layer_output,) + outputs return outputs @@ -469,35 +359,21 @@ def forward( hidden_states, attention_mask=None, head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, output_attentions=False, output_hidden_states=False, return_dict=True, ): all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None - all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None - next_decoder_cache = () if use_cache else None for i, layer_module in enumerate(self.layer): if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) layer_head_mask = head_mask[i] if head_mask is not None else None - past_key_value = past_key_values[i] if past_key_values is not None else None if getattr(self.config, "gradient_checkpointing", False) and self.training: - if use_cache: - logger.warn( - "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting " - "`use_cache=False`..." 
- ) - use_cache = False - def create_custom_forward(module): def custom_forward(*inputs): return module(*inputs, past_key_value, output_attentions) @@ -509,27 +385,19 @@ def custom_forward(*inputs): hidden_states, attention_mask, layer_head_mask, - encoder_hidden_states, - encoder_attention_mask, ) - else: + else: layer_outputs = layer_module( hidden_states, attention_mask, layer_head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, output_attentions, ) hidden_states = layer_outputs[0] - if use_cache: - next_decoder_cache += (layer_outputs[-1],) + if output_attentions: all_self_attentions = all_self_attentions + (layer_outputs[1],) - if self.config.add_cross_attention: - all_cross_attentions = all_cross_attentions + (layer_outputs[2],) if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) @@ -539,69 +407,18 @@ def custom_forward(*inputs): v for v in [ hidden_states, - next_decoder_cache, all_hidden_states, all_self_attentions, - all_cross_attentions, ] if v is not None ) - return BaseModelOutputWithPastAndCrossAttentions( + return BaseModelOutput( last_hidden_state=hidden_states, - past_key_values=next_decoder_cache, hidden_states=all_hidden_states, attentions=all_self_attentions, - cross_attentions=all_cross_attentions, ) -class ViTPredictionHeadTransform(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - if isinstance(config.hidden_act, str): - self.transform_act_fn = ACT2FN[config.hidden_act] - else: - self.transform_act_fn = config.hidden_act - self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - - def forward(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.transform_act_fn(hidden_states) - hidden_states = self.LayerNorm(hidden_states) - return hidden_states - - -class ViTLMPredictionHead(nn.Module): - def __init__(self, config): - super().__init__() - self.transform = ViTPredictionHeadTransform(config) - - # The output weights are the same as the input embeddings, but there is - # an output-only bias for each token. - self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - self.bias = nn.Parameter(torch.zeros(config.vocab_size)) - - # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` - self.decoder.bias = self.bias - - def forward(self, hidden_states): - hidden_states = self.transform(hidden_states) - hidden_states = self.decoder(hidden_states) - return hidden_states - - -class ViTOnlyMLMHead(nn.Module): - def __init__(self, config): - super().__init__() - self.predictions = ViTLMPredictionHead(config) - - def forward(self, sequence_output): - prediction_scores = self.predictions(sequence_output) - return prediction_scores - - class ViTPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and @@ -670,21 +487,6 @@ def _init_weights(self, module): VIT_START_DOCSTRING, ) class ViTModel(ViTPreTrainedModel): - """ - - The model can behave as an encoder (with only self-attention) as well - as a decoder, in which case a layer of cross-attention is added between - the self-attention layers, following the architecture described in `Attention is - all you need `__ by Ashish Vaswani, - Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. 
- - To behave as an decoder the model needs to be initialized with the - :obj:`is_decoder` argument of the configuration set to :obj:`True`. - To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder` - argument and :obj:`add_cross_attention` set to :obj:`True`; an - :obj:`encoder_hidden_states` is then expected as an input to the forward pass. - """ - def __init__(self, config): super().__init__(config) self.config = config @@ -712,7 +514,7 @@ def _prune_heads(self, heads_to_prune): @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="vit-base-patch16-224", - output_type=BaseModelOutputWithPastAndCrossAttentions, + output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC, ) def forward( @@ -720,44 +522,18 @@ def forward( pixel_values=None, attention_mask=None, head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): - r""" - encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): - Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention - if the model is configured as a decoder. - encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Mask to avoid performing attention on the padding token indices of the encoder input. This mask - is used in the cross-attention if the model is configured as a decoder. - Mask values selected in ``[0, 1]``: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): - Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. - If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` - (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` - instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. - use_cache (:obj:`bool`, `optional`): - If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up - decoding (see :obj:`past_key_values`). 
- """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - # if self.config.is_decoder: - # use_cache = use_cache if use_cache is not None else self.config.use_cache - # else: - # use_cache = False + if pixel_values is None: + raise ValueError("You have to specify pixel_values") # if input_ids is not None and inputs_embeds is not None: # raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -772,30 +548,13 @@ def forward( # device = input_ids.device if input_ids is not None else inputs_embeds.device - # # past_key_values_length - # past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 - - # if attention_mask is None: # attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) - # if token_type_ids is None: - # token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) # # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # # ourselves in which case we just need to make it broadcastable to all heads. # extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) - # # If a 2D or 3D attention mask is provided for the cross-attention - # # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] - # if self.config.is_decoder and encoder_hidden_states is not None: - # encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() - # encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) - # if encoder_attention_mask is None: - # encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) - # encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) - # else: - # encoder_extended_attention_mask = None - # # Prepare head mask if needed # # 1.0 in head_mask indicate we keep the head # # attention_probs has shape bsz x n_heads x N x N @@ -809,12 +568,8 @@ def forward( encoder_outputs = self.encoder( embedding_output, - attention_mask=None, # replaced extended_attention_mask + attention_mask=None, # replaced extended_attention_mask head_mask=head_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=None, # replaced encoder_extended_attention_mask - past_key_values=past_key_values, - use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, @@ -824,36 +579,13 @@ def forward( if not return_dict: return (sequence_output,) + encoder_outputs[1:] - return BaseModelOutputWithPastAndCrossAttentions( + return BaseModelOutput( last_hidden_state=sequence_output, - past_key_values=encoder_outputs.past_key_values, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, - cross_attentions=encoder_outputs.cross_attentions, ) -# class ViTClassificationHead(nn.Module): -# """Head for image classification tasks.""" - -# def __init__(self, config): -# super().__init__() -# self.dense = nn.Linear(config.hidden_size, config.hidden_size) -# self.dropout = nn.Dropout(config.hidden_dropout_prob) -# self.out_proj = nn.Linear(config.hidden_size, config.num_labels) - -# self.config = config - -# def forward(self, features, **kwargs): 
-# x = features[:, 0, :] # take [CLS] token -# x = self.dropout(x) -# x = self.dense(x) -# x = ACT2FN[self.config.hidden_act](x) -# x = self.dropout(x) -# x = self.out_proj(x) -# return x - - @add_start_docstrings( """ViT Model transformer with an image classification head on top (a linear layer on top of the pooled output) e.g. for ImageNet. """, @@ -862,7 +594,7 @@ def forward( class ViTForImageClassification(ViTPreTrainedModel): def __init__(self, config): super().__init__(config) - + self.num_labels = config.num_labels self.vit = ViTModel(config) # Classifier head @@ -879,18 +611,18 @@ def __init__(self, config): # config_class=_CONFIG_FOR_DOC, # ) def forward( - self, - pixel_values=None, - attention_mask=None, - head_mask=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, + self, + pixel_values=None, + attention_mask=None, + head_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for computing the sequence classification/regression loss. + Labels for computing the image classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). @@ -930,4 +662,4 @@ def forward( logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, - ) \ No newline at end of file + ) diff --git a/tests/test_image_processor_common.py b/tests/test_image_processor_common.py index 397c9092496202..ceb437fe38a233 100644 --- a/tests/test_image_processor_common.py +++ b/tests/test_image_processor_common.py @@ -144,4 +144,4 @@ def test_attention_mask(self): processed = image_processor.pad(processed, padding="biggest", return_tensors="np") self.assertIn("attention_mask", processed) self.assertListEqual(list(processed.attention_mask.shape), list(processed[input_name].shape[:2])) - self.assertListEqual(processed.attention_mask.sum(-1).tolist(), input_lenghts) \ No newline at end of file + self.assertListEqual(processed.attention_mask.sum(-1).tolist(), input_lenghts) diff --git a/tests/test_image_processor_vit.py b/tests/test_image_processor_vit.py index 8919b422c9ecc3..0d483423fcd973 100644 --- a/tests/test_image_processor_vit.py +++ b/tests/test_image_processor_vit.py @@ -75,7 +75,7 @@ def prepare_inputs_numpy_for_common(self, equal_resolution=False): image_inputs = torch.randn((self.batch_size, *input_size)) return image_inputs - + def prepare_inputs_pytorch_for_common(self, equal_resolution=False): input_size = (self.num_channels, self.image_size, self.image_size) image_inputs = torch.randn((self.batch_size, *input_size)) @@ -124,10 +124,10 @@ def test_call_pytorch(self): encoded_images_2 = image_processor(np_image_inputs, return_tensors="pt").input_values for enc_seq_1, enc_seq_2 in zip(encoded_images_1, encoded_images_2): self.assertTrue(torch.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) - + def test_normalization(self): pass @slow def test_pretrained_checkpoints_are_set_correctly(self): - pass \ No newline at end of file + pass diff --git a/tests/test_modeling_vit.py b/tests/test_modeling_vit.py index a3e6d917c649e9..a32c1f9868c25f 100644 --- a/tests/test_modeling_vit.py +++ b/tests/test_modeling_vit.py @@ -17,52 +17,46 @@ import unittest -from PIL import Image -import requests import torchvision.transforms as T +from 
PIL import Image +import requests from transformers import is_torch_available from transformers.testing_utils import require_torch, slow, torch_device from .test_configuration_common import ConfigTester -from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask, floats_tensor +from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask if is_torch_available(): import torch - from transformers import ( - ViTConfig, - ViTForImageClassification, - ViTModel, - ) - from transformers.models.vit.modeling_vit import ( - VIT_PRETRAINED_MODEL_ARCHIVE_LIST, - ) + from transformers import ViTConfig, ViTForImageClassification, ViTModel + from transformers.models.vit.modeling_vit import VIT_PRETRAINED_MODEL_ARCHIVE_LIST class ViTModelTester: def __init__( - self, - parent, - batch_size=13, - image_size=30, - patch_size=2, - num_channels=3, - is_training=True, - use_input_mask=True, - use_labels=True, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - type_sequence_label_size=10, - initializer_range=0.02, - num_labels=3, - scope=None, + self, + parent, + batch_size=13, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + use_input_mask=True, + use_labels=True, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + type_sequence_label_size=10, + initializer_range=0.02, + num_labels=3, + scope=None, ): self.parent = parent self.batch_size = batch_size @@ -112,9 +106,7 @@ def prepare_config_and_inputs(self): return config, pixel_values, input_mask, image_labels - def create_and_check_model( - self, config, pixel_values, input_mask, image_labels - ): + def create_and_check_model(self, config, pixel_values, input_mask, image_labels): model = ViTModel(config=config) model.to(torch_device) model.eval() @@ -122,9 +114,7 @@ def create_and_check_model( result = model(pixel_values) self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - def create_and_check_for_image_classification( - self, config, pixel_values, input_mask, image_labels - ): + def create_and_check_for_image_classification(self, config, pixel_values, input_mask, image_labels): config.num_labels = self.num_labels model = ViTForImageClassification(config) model.to(torch_device) @@ -186,15 +176,17 @@ def test_model_from_pretrained(self): # We will verify our results on an image of cute cats # TODO: use VitImageProcessor in the future def prepare_img(image_resolution): - url = 'http://images.cocodataset.org/val2017/000000039769.jpg' + url = "http://images.cocodataset.org/val2017/000000039769.jpg" im = Image.open(requests.get(url, stream=True).raw) # standard PyTorch mean-std input image normalization - transform = T.Compose([ - T.Resize((image_resolution,image_resolution)), - T.ToTensor(), - T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) - ]) + transform = T.Compose( + [ + T.Resize((image_resolution, image_resolution)), + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), + ] + ) # mean-std normalize the input image (batch-size: 1) img = transform(im).unsqueeze(0) @@ -217,8 +209,6 @@ def test_inference_image_classification_head(self): expected_shape = torch.Size((1, 1000)) self.assertEqual(outputs.logits.shape, expected_shape) - 
expected_slice = torch.tensor( - [-0.7332, 0.7286, -0.4020] - ).to(torch_device) + expected_slice = torch.tensor([-0.7332, 0.7286, -0.4020]).to(torch_device) - self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) \ No newline at end of file + self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) From 6da3261576193cfc4becb8b9cf76d88d2e31d59a Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 15 Mar 2021 21:01:28 +0100 Subject: [PATCH 14/44] Fix another rebase issue --- src/transformers/models/auto/configuration_auto.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index ab6541edfca9af..67e0fee66152c0 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -81,11 +81,8 @@ (key, value) for pretrained_map in [ # Add archive maps here -<<<<<<< HEAD SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, -======= VIT_PRETRAINED_CONFIG_ARCHIVE_MAP, ->>>>>>> 8352309bd... First commit - copy from modeling_vit_pytorch WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP, M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP, CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -186,11 +183,8 @@ MODEL_NAMES_MAPPING = OrderedDict( [ # Add full (and cased) model names here -<<<<<<< HEAD ("speech_to_text", "Speech2Text"), -======= ("vit", "ViT"), ->>>>>>> 8352309bd... First commit - copy from modeling_vit_pytorch ("wav2vec2", "Wav2Vec2"), ("m2m_100", "M2M100"), ("convbert", "ConvBERT"), From d48609a60a54abee8504beff9d491167d697438d Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 15 Mar 2021 21:03:45 +0100 Subject: [PATCH 15/44] Fix another rebase issue --- src/transformers/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 907d5cbb31c4c3..aeeadda9ba692e 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -398,6 +398,8 @@ "SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST", "Speech2TextForConditionalGeneration", "Speech2TextModel", + ] + ) _import_structure["models.vit"].extend( [ "VIT_PRETRAINED_MODEL_ARCHIVE_LIST", From 43524d0aee0071cca793721e226289ec4176e28e Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 16 Mar 2021 10:34:55 +0100 Subject: [PATCH 16/44] Major cleanup - renamed ViTImageProcessor to ViTFeatureExtractor --- src/transformers/__init__.py | 8 +- src/transformers/file_utils.py | 15 +- src/transformers/image_processor_utils.py | 763 ------------------ src/transformers/models/auto/modeling_auto.py | 2 + src/transformers/models/vit/__init__.py | 10 +- .../models/vit/feature_extraction_vit.py | 141 ++++ .../models/vit/image_processor_vit.py | 249 ------ src/transformers/testing_utils.py | 14 + ..._vit.py => test_feature_extraction_vit.py} | 52 +- tests/test_modeling_vit.py | 1 + 10 files changed, 209 insertions(+), 1046 deletions(-) delete mode 100644 src/transformers/image_processor_utils.py create mode 100644 src/transformers/models/vit/feature_extraction_vit.py delete mode 100644 src/transformers/models/vit/image_processor_vit.py rename tests/{test_image_processor_vit.py => test_feature_extraction_vit.py} (65%) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index aeeadda9ba692e..0795dbf3e81ce6 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -126,7 +126,7 @@ ], "models": [], # Models - "models.vit": 
["VIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTConfig", "ViTImageProcessor"], + "models.vit": ["VIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTConfig", "ViTFeatureExtractor"], "models.wav2vec2": [ "WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP", "Wav2Vec2Config", @@ -1425,7 +1425,11 @@ TransfoXLCorpus, TransfoXLTokenizer, ) - from .models.vit import VIT_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTConfig, ViTImageProcessor + from .models.vit import ( + VIT_PRETRAINED_CONFIG_ARCHIVE_MAP, + ViTConfig, + ViTFeatureExtractor, + ) from .models.wav2vec2 import ( WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP, Wav2Vec2Config, diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py index 127a8c9eb0c63d..f26f566408bdcb 100644 --- a/src/transformers/file_utils.py +++ b/src/transformers/file_utils.py @@ -182,14 +182,23 @@ except importlib_metadata.PackageNotFoundError: _soundfile_available = False + _torchaudio_available = importlib.util.find_spec("torchaudio") try: _torchaudio_version = importlib_metadata.version("torchaudio") - logger.debug(f"Successfully imported soundfile version {_torchaudio_version}") + logger.debug(f"Successfully imported torchaudio version {_torchaudio_version}") except importlib_metadata.PackageNotFoundError: _torchaudio_available = False +_torchvision_available = importlib.util.find_spec("torchvision") +try: + _torchvision_version = importlib_metadata.version("torchvision") + logger.debug(f"Successfully imported torchvision version {_torchvision_version}") +except importlib_metadata.PackageNotFoundError: + _torchvision_available = False + + torch_cache_home = os.getenv("TORCH_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "torch")) old_default_cache_path = os.path.join(torch_cache_home, "transformers") # New default cache, shared with the Datasets library @@ -381,6 +390,10 @@ def is_torchaudio_available(): return _torchaudio_available +def is_torchvision_available(): + return _torchvision_available + + def torch_only_method(fn): def wrapper(*args, **kwargs): if not _torch_available: diff --git a/src/transformers/image_processor_utils.py b/src/transformers/image_processor_utils.py deleted file mode 100644 index 170adb2e811e91..00000000000000 --- a/src/transformers/image_processor_utils.py +++ /dev/null @@ -1,763 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - Image processor common class for python image processors. - - Based on https://github.com/huggingface/transformers/blob/master/src/transformers/feature_extraction_utils.py, but - PreTrainedFeatureExtractor -> PreTrainedImageProcessor, BatchFeature -> BatchImages, and so on. 
-""" -import copy -import json -import os -from collections import UserDict -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union - -import numpy as np -import torch -from torch import Tensor - -from .file_utils import ( - PaddingStrategy, - TensorType, - _is_jax, - _is_numpy, - _is_tensorflow, - _is_torch, - _is_torch_device, - add_end_docstrings, - cached_path, - hf_bucket_url, - is_flax_available, - is_remote_url, - is_tf_available, - is_torch_available, - to_py_obj, - torch_required, -) -from .utils import logging - - -logger = logging.get_logger(__name__) - -if TYPE_CHECKING: - if is_torch_available(): - import torch - - -class BatchImages(UserDict): - r""" - Holds the output of the :meth:`~transformers.PreTrainedImageProcessor.pad` and image processor specific - ``__call__`` methods. - - This class is derived from a python dictionary and can be used as a dictionary. - - - Args: - data (:obj:`dict`): - Dictionary of lists/arrays/tensors returned by the __call__/pad methods ('pixel_values', 'pixel_mask', - etc.). - tensor_type (:obj:`Union[None, str, TensorType]`, `optional`): - You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at - initialization. - """ - - def __init__(self, data: Optional[Dict[str, Any]] = None, tensor_type: Union[None, str, TensorType] = None): - super().__init__(data) - self.convert_to_tensors(tensor_type=tensor_type) - - def __getitem__(self, item: str) -> Union[Any]: - """ - If the key is a string, returns the value of the dict associated to :obj:`key` ('pixel_values', 'pixel_mask', - etc.). - """ - if isinstance(item, str): - return self.data[item] - else: - raise KeyError("Indexing with integers is not available when using Python based image processors") - - def __getattr__(self, item: str): - try: - return self.data[item] - except KeyError: - raise AttributeError - - def __getstate__(self): - return {"data": self.data} - - def __setstate__(self, state): - if "data" in state: - self.data = state["data"] - - # Copied from transformers.tokenization_utils_base.BatchEncoding.keys - def keys(self): - return self.data.keys() - - # Copied from transformers.tokenization_utils_base.BatchEncoding.values - def values(self): - return self.data.values() - - # Copied from transformers.tokenization_utils_base.BatchEncoding.items - def items(self): - return self.data.items() - - def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None): - """ - Convert the inner content to tensors. - - - Args: - tensor_type (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`): - The type of tensors to use. If :obj:`str`, should be one of the values of the enum - :class:`~transformers.file_utils.TensorType`. If :obj:`None`, no modification is done. - """ - if tensor_type is None: - return self - - # Convert to TensorType - if not isinstance(tensor_type, TensorType): - tensor_type = TensorType(tensor_type) - - # Get a function reference for the correct framework - if tensor_type == TensorType.TENSORFLOW: - if not is_tf_available(): - raise ImportError( - "Unable to convert output to TensorFlow tensors format, TensorFlow is not installed." 
- ) - import tensorflow as tf - - as_tensor = tf.constant - is_tensor = tf.is_tensor - elif tensor_type == TensorType.PYTORCH: - if not is_torch_available(): - raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.") - import torch - - as_tensor = torch.tensor - is_tensor = torch.is_tensor - elif tensor_type == TensorType.JAX: - if not is_flax_available(): - raise ImportError("Unable to convert output to JAX tensors format, JAX is not installed.") - import jax.numpy as jnp # noqa: F811 - - as_tensor = jnp.array - is_tensor = _is_jax - else: - as_tensor = np.asarray - is_tensor = _is_numpy - - # Do the tensor conversion in batch - for key, value in self.items(): - try: - if not is_tensor(value): - tensor = as_tensor(value) - - self[key] = tensor - except: # noqa E722 - if key == "overflowing_values": - raise ValueError("Unable to create tensor returning overflowing values of different lengths. ") - raise ValueError( - "Unable to create tensor, you should probably activate padding " - "with 'padding=True' to have batched tensors with the same resolution." - ) - - return self - - @torch_required - # Copied from transformers.tokenization_utils_base.BatchEncoding.to with BatchEncoding->BatchImages - def to(self, device: Union[str, "torch.device"]) -> "BatchImages": - """ - Send all values to device by calling :obj:`v.to(device)` (PyTorch only). - - - Args: - device (:obj:`str` or :obj:`torch.device`): The device to put the tensors on. - - - Returns: - :class:`~transformers.BatchImages`: The same instance of :class:`~transformers.BatchImages` after - modification. - """ - - # This check catches things like APEX blindly calling "to" on all inputs to a module - # Otherwise it passes the casts down and casts the LongTensor containing the token idxs - # into a HalfTensor - if isinstance(device, str) or _is_torch_device(device) or isinstance(device, int): - self.data = {k: v.to(device=device) for k, v in self.data.items()} - else: - logger.warning(f"Attempting to cast a BatchImages to type {str(device)}. This is not supported.") - return self - - -class PreTrainedImageProcessor: - """ - This is a general image processor class for vision-related tasks. - - - Args: - image_mean (:obj:`List[float]`): - The sequence of means for each channel, to be used when normalizing images. - image_std (:obj:`List[Float]`): - The sequence of standard deviations for each channel, to be used when normalizing images. - padding_value (:obj:`float`): - The value that is used to fill the padding pixels. - """ - - def __init__(self, image_mean: int, image_std: int, padding_value: float, **kwargs): - self.image_mean = image_mean - self.image_std = image_std - self.padding_value = padding_value - - self.return_pixel_mask = kwargs.pop("return_pixel_mask", True) - - # Additional attributes without default values - for key, value in kwargs.items(): - try: - setattr(self, key, value) - except AttributeError as err: - logger.error(f"Can't set {key} with value {value} for {self}") - raise err - - @classmethod - def from_pretrained( - cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs - ) -> "PreTrainedImageProcessor": - r""" - Instantiate a :class:`~transformers.PreTrainedImageProcessor` (or a derived class) from a pretrained image - processor. - - - Args: - pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`): - This can be either: - - - - a string, the `model id` of a pretrained image_processor hosted inside a model repo on - huggingface.co. 
Valid model ids can be located at the root-level, like ``bert-base-uncased``, or - namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``. - - a path to a `directory` containing a image processor file saved using the - :func:`~transformers.PreTrainedImageProcessor.save_pretrained` method, e.g., - ``./my_model_directory/``. - - a path or url to a saved image processor JSON `file`, e.g., - ``./my_model_directory/feature_extraction_config.json``. - cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`): - Path to a directory in which a downloaded pretrained model image processor should be cached if the - standard cache should not be used. - force_download (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not to force to (re-)download the image processor files and override the cached versions if - they exist. - resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not to delete incompletely received file. Attempts to resume the download if such a file - exists. - proxies (:obj:`Dict[str, str]`, `optional`): - A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. - use_auth_token (:obj:`str` or `bool`, `optional`): - The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token - generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`). - revision(:obj:`str`, `optional`, defaults to :obj:`"main"`): - The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a - git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any - identifier allowed by git. - return_unused_kwargs (:obj:`bool`, `optional`, defaults to :obj:`False`): - If :obj:`False`, then this function returns just the final image processor object. - - If :obj:`True`, then this functions returns a :obj:`Tuple(image_processor, unused_kwargs)` where - `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not image processor - attributes: i.e., the part of ``kwargs`` which has not been used to update ``image_processor`` and is - otherwise ignored. - kwargs (:obj:`Dict[str, Any]`, `optional`): - The values in kwargs of any keys which are image processor attributes will be used to override the - loaded values. Behavior concerning key/value pairs whose keys are *not* image processor attributes is - controlled by the ``return_unused_kwargs`` keyword parameter. - - .. note:: - - Passing :obj:`use_auth_token=True` is required when you want to use a private model. - - - - Returns: - :class:`~transformers.PreTrainedImageProcessor`: The image processor object instantiated from this - pretrained model. - - - Examples:: - - # We can't instantiate directly the base class `PreTrainedImageProcessor` so let's show the examples on a - # derived class: DetrImageProcessor - image_processor = DetrImageProcessor.from_pretrained('facebook/detr-resnet-50') # Download image_processor_config from huggingface.co and cache. - image_processor = DetrImageProcessor.from_pretrained('./test/saved_model/') # E.g. 
image_processor (or model) was saved using `save_pretrained('./test/saved_model/')` - image_processor = DetrImageProcessor.from_pretrained('./test/saved_model/image_processor_config.json') - image_processor = DetrImageProcessor.from_pretrained('facebook/detr-resnet-50', return_pixel_mask=False, foo=False) - assert image_processor.return_pixel_mask is False - image_processor, unused_kwargs = DetrImageProcessor.from_pretrained('facebook/detr-resnet-50', return_pixel_mask=False, - foo=False, return_unused_kwargs=True) - assert image_processor.return_pixel_mask is False - assert unused_kwargs == {'foo': False} - - """ - image_processor_dict, kwargs = cls.get_image_processor_dict(pretrained_model_name_or_path, **kwargs) - - return cls.from_dict(image_processor_dict, **kwargs) - - def save_pretrained(self, save_directory: Union[str, os.PathLike]): - """ - Save a image_processor object to the directory ``save_directory``, so that it can be re-loaded using the - :func:`~transformers.PreTrainedImageProcessor.from_pretrained` class method. - - - Args: - save_directory (:obj:`str` or :obj:`os.PathLike`): - Directory where the image processor JSON file will be saved (will be created if it does not exist). - """ - if os.path.isfile(save_directory): - raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") - os.makedirs(save_directory, exist_ok=True) - # If we save using the predefined names, we can load using `from_pretrained` - output_image_processor_file = os.path.join(save_directory, FEATURE_EXTRACTOR_NAME) - - self.to_json_file(output_image_processor_file) - logger.info(f"Configuration saved in {output_image_processor_file}") - - @classmethod - def get_image_processor_dict( - cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs - ) -> Tuple[Dict[str, Any], Dict[str, Any]]: - """ - From a ``pretrained_model_name_or_path``, resolve to a dictionary of parameters, to be used for instantiating a - :class:`~transformers.PreTrainedImageProcessor` using ``from_dict``. - - - Parameters: - pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`): - The identifier of the pre-trained checkpoint from which we want the dictionary of parameters. - - - Returns: - :obj:`Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the image processor object. 
- """ - cache_dir = kwargs.pop("cache_dir", None) - force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", False) - proxies = kwargs.pop("proxies", None) - use_auth_token = kwargs.pop("use_auth_token", None) - local_files_only = kwargs.pop("local_files_only", False) - revision = kwargs.pop("revision", None) - - pretrained_model_name_or_path = str(pretrained_model_name_or_path) - if os.path.isdir(pretrained_model_name_or_path): - image_processor_file = os.path.join(pretrained_model_name_or_path, FEATURE_EXTRACTOR_NAME) - elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): - image_processor_file = pretrained_model_name_or_path - else: - image_processor_file = hf_bucket_url( - pretrained_model_name_or_path, filename=FEATURE_EXTRACTOR_NAME, revision=revision, mirror=None - ) - - try: - # Load from URL or cache if already cached - resolved_image_processor_file = cached_path( - image_processor_file, - cache_dir=cache_dir, - force_download=force_download, - proxies=proxies, - resume_download=resume_download, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - ) - # Load image_processor dict - with open(resolved_image_processor_file, "r", encoding="utf-8") as reader: - text = reader.read() - image_processor_dict = json.loads(text) - - except EnvironmentError as err: - logger.error(err) - msg = ( - f"Can't load image processor for '{pretrained_model_name_or_path}'. Make sure that:\n\n" - f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n" - f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a {FEATURE_EXTRACTOR_NAME} file\n\n" - ) - raise EnvironmentError(msg) - - except json.JSONDecodeError: - msg = ( - f"Couldn't reach server at '{image_processor_file}' to download image processor configuration file or " - "image processor configuration file is not a valid JSON file. " - f"Please check network or file content here: {resolved_image_processor_file}." - ) - raise EnvironmentError(msg) - - if resolved_image_processor_file == image_processor_file: - logger.info(f"loading image processor configuration file {image_processor_file}") - else: - logger.info( - f"loading image processor configuration file {image_processor_file} from cache at {resolved_image_processor_file}" - ) - - return image_processor_dict, kwargs - - @classmethod - def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs) -> "PreTrainedImageProcessor": - """ - Instantiates a :class:`~transformers.PreTrainedImageProcessor` from a Python dictionary of parameters. - - - Args: - image_processor_dict (:obj:`Dict[str, Any]`): - Dictionary that will be used to instantiate the image processor object. Such a dictionary can be - retrieved from a pretrained checkpoint by leveraging the - :func:`~transformers.PreTrainedImageProcessor.to_dict` method. - kwargs (:obj:`Dict[str, Any]`): - Additional parameters from which to initialize the image processor object. - - - Returns: - :class:`~transformers.PreTrainedImageProcessor`: The image processor object instantiated from those - parameters. 
- """ - return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) - - image_processor = cls(**image_processor_dict) - - # Update image_processor with kwargs if needed - to_remove = [] - for key, value in kwargs.items(): - if hasattr(image_processor, key): - setattr(image_processor, key, value) - to_remove.append(key) - for key in to_remove: - kwargs.pop(key, None) - - logger.info(f"Image processor {image_processor}") - if return_unused_kwargs: - return image_processor, kwargs - else: - return image_processor - - def to_dict(self) -> Dict[str, Any]: - """ - Serializes this instance to a Python dictionary. - - - Returns: - :obj:`Dict[str, Any]`: Dictionary of all the attributes that make up this image processor instance. - """ - output = copy.deepcopy(self.__dict__) - - return output - - @classmethod - def from_json_file(cls, json_file: Union[str, os.PathLike]) -> "PreTrainedImageProcessor": - """ - Instantiates a :class:`~transformers.PreTrainedImageProcessor` from the path to a JSON file of parameters. - - - Args: - json_file (:obj:`str` or :obj:`os.PathLike`): - Path to the JSON file containing the parameters. - - - Returns: - :class:`~transformers.PreTrainedImageProcessor`: The image_processor object instantiated from that JSON - file. - - """ - with open(json_file, "r", encoding="utf-8") as reader: - text = reader.read() - image_processor_dict = json.loads(text) - return cls(**image_processor_dict) - - def to_json_string(self) -> str: - """ - Serializes this instance to a JSON string. - - - Returns: - :obj:`str`: String containing all the attributes that make up this image_processor instance in JSON format. - """ - return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" - - def to_json_file(self, json_file_path: Union[str, os.PathLike]): - """ - Save this instance to a JSON file. - - - Args: - json_file_path (:obj:`str` or :obj:`os.PathLike`): - Path to the JSON file in which this image_processor instance's parameters will be saved. - """ - with open(json_file_path, "w", encoding="utf-8") as writer: - writer.write(self.to_json_string()) - - def __repr__(self): - return f"{self.__class__.__name__} {self.to_json_string()}" - - def pad( - self, - processed_images: Union[ - BatchImages, - List[BatchImages], - Dict[str, BatchImages], - Dict[str, List[BatchImages]], - List[Dict[str, BatchImages]], - ], - padding: Union[bool, str, PaddingStrategy] = True, - max_resolution: Optional[int] = None, - pad_to_multiple_of: Optional[int] = None, - return_pixel_mask: Optional[bool] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - ) -> BatchImages: - """ - Pad input values or a batch of input values up to predefined resolution or to the max resolution in the batch. - - Padding values are defined at the image processor level (with ``self.padding_value``). - - .. note:: - - If the ``processed_images`` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, - the result will use the same type unless you provide a different tensor type with ``return_tensors``. In - the case of PyTorch tensors, you will lose the specific device of your tensors however. - - - Args: - processed_images (:class:`~transformers.BatchImages`, list of :class:`~transformers.BatchImages`, :obj:`Dict[str, List[float]]`, :obj:`Dict[str, List[List[float]]` or :obj:`List[Dict[str, List[float]]]`): - Processed inputs. 
Can represent one input (:class:`~transformers.BatchImages` or :obj:`Dict[str, - List[float]]`) or a batch of input values / vectors (list of :class:`~transformers.BatchImages`, - `Dict[str, List[List[float]]]` or `List[Dict[str, List[float]]]`) so you can use this method during - preprocessing as well as in a PyTorch Dataloader collate function. - - Instead of :obj:`List[float]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow - tensors), see the note above for the return type. - padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`): - Select a strategy to pad the returned sequences (according to the model's padding side and padding - index) among: - - - * :obj:`True` or :obj:`'biggest'`: Pad to the biggest image in the batch (or no padding if only a - single image if provided). - * :obj:`'max_resolution'`: Pad to a maximum resolution specified with the argument - :obj:`max_resolution` or to the maximum acceptable input resolution for the model if that argument is - not provided. - * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of - different resolutions). - max_resolution (:obj:`int`, `optional`): - Maximum resolution of the returned list and optionally padding length (see above). - pad_to_multiple_of (:obj:`int`, `optional`): - If set will pad the sequence to a multiple of the provided value. - - This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability - >= 7.5 (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. - return_pixel_mask (:obj:`bool`, `optional`): - Whether to return the pixel mask. If left to the default, will return the pixel mask according to the - specific image_processor's default. - - `What are pixel masks? <../glossary.html#attention-mask>`__ - return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`): - If set, will return tensors instead of list of python integers. Acceptable values are: - - - * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. - * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects. - * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects. 
- """ - # If we have a list of dicts, let's convert it in a dict of lists - # We do this to allow using this method as a collate_fn function in PyTorch Dataloader - if isinstance(processed_images, (list, tuple)) and isinstance(processed_images[0], (dict, BatchImages)): - processed_images = { - key: [example[key] for example in processed_images] for key in processed_images[0].keys() - } - - # The model's main input name, usually `pixel_values`, has be passed for padding - if self.model_input_names[0] not in processed_images: - raise ValueError( - "You should supply an instance of :class:`~transformers.BatchImages` or list of :class:`~transformers.BatchImages` to this method" - f"that includes {self.model_input_names[0]}, but you provided {list(processed_images.keys())}" - ) - - required_input = processed_images[self.model_input_names[0]] - return_pixel_mask = return_pixel_mask if return_pixel_mask is not None else self.return_pixel_mask - - if not required_input: - if return_pixel_mask: - processed_images["pixel_mask"] = [] - return processed_images - - # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects - # and rebuild them afterwards if no return_tensors is specified - # Note that we lose the specific device the tensor may be on for PyTorch - - first_element = required_input[0] - if isinstance(first_element, (list, tuple)): - # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element. - index = 0 - while len(required_input[index]) == 0: - index += 1 - if index < len(required_input): - first_element = required_input[index][0] - # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do. - if not isinstance(first_element, (float, int, list, tuple)): - if is_tf_available() and _is_tensorflow(first_element): - return_tensors = "tf" if return_tensors is None else return_tensors - elif is_torch_available() and _is_torch(first_element): - return_tensors = "pt" if return_tensors is None else return_tensors - elif isinstance(first_element, np.ndarray): - return_tensors = "np" if return_tensors is None else return_tensors - else: - raise ValueError( - f"type of {first_element} unknown: {type(first_element)}. " - f"Should be one of a python, numpy, pytorch or tensorflow object." - ) - - for key, value in processed_images.items(): - processed_images[key] = to_py_obj(value) - - # Convert padding_strategy in PaddingStrategy - padding_strategy, max_resolution, _ = self._get_padding_strategies( - padding=padding, max_resolution=max_resolution - ) - - required_input = processed_images[self.model_input_names[0]] - if required_input and not isinstance(required_input[0], (list, tuple)): - processed_images = self._pad( - processed_images, - max_resolution=max_resolution, - padding_strategy=padding_strategy, - pad_to_multiple_of=pad_to_multiple_of, - return_pixel_mask=return_pixel_mask, - ) - return BatchImages(processed_images, tensor_type=return_tensors) - - batch_size = len(required_input) - assert all( - len(v) == batch_size for v in processed_images.values() - ), "Some items in the output dictionary have a different batch size than others." 
- - if padding_strategy == PaddingStrategy.BIGGEST: - max_resolution = max(len(inputs) for inputs in required_input) - padding_strategy = PaddingStrategy.MAX_RESOLUTION - - batch_outputs = {} - for i in range(batch_size): - inputs = dict((k, v[i]) for k, v in processed_images.items()) - outputs = self._pad( - inputs, - max_resolution=max_resolution, - padding_strategy=padding_strategy, - pad_to_multiple_of=pad_to_multiple_of, - return_pixel_mask=return_pixel_mask, - ) - - for key, value in outputs.items(): - if key not in batch_outputs: - batch_outputs[key] = [] - batch_outputs[key].append(value) - - return BatchImages(batch_outputs, tensor_type=return_tensors) - - def _pad( - self, - processed_images: Union[Dict[str, List[float]], BatchImages], - max_resolution: Optional[int] = None, - padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, - pad_to_multiple_of: Optional[int] = None, - return_pixel_mask: Optional[bool] = None, - ) -> dict: - """ - Pad inputs (up to predefined resolution or max resolution in the batch) - - - Args: - processed_images: Dictionary of input values (`List[float]`) / input vectors (`List[List[float]]`) or batch of inputs values (`List[List[int]]`) / input vectors (`List[List[List[int]]]`) - max_resolution: maximum resolution of the returned list and optionally padding length (see below) - padding_strategy: PaddingStrategy to use for padding. - - - - PaddingStrategy.BIGGEST Pad to the biggest image in the batch (default) - - PaddingStrategy.MAX_RESOLUTION: Pad to the max resolution - - PaddingStrategy.DO_NOT_PAD: Do not pad - - pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. - This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability - >= 7.5 (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. 
- return_pixel_mask: (optional) Set to False to avoid returning pixel mask (default: set to model specifics) - """ - required_input = processed_images[self.model_input_names[0]] - - if padding_strategy == PaddingStrategy.BIGGEST: - max_resolution = len(required_input) - - if ( - max_resolution is not None - and pad_to_multiple_of is not None - and (max_resolution % pad_to_multiple_of != 0) - ): - max_resolution = ((max_resolution // pad_to_multiple_of) + 1) * pad_to_multiple_of - - needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_resolution - - if needs_to_be_padded: - difference = max_resolution - len(required_input) - padding_vector = self.feature_size * [self.padding_value] if self.feature_size > 1 else self.padding_value - # if self.padding_side == "right": - # if return_pixel_mask: - # processed_images["pixel_mask"] = [1] * len(required_input) + [0] * difference - # processed_images[self.model_input_names[0]] = required_input + [ - # padding_vector for _ in range(difference) - # ] - # elif self.padding_side == "left": - # if return_pixel_mask: - # processed_images["pixel_mask"] = [0] * difference + [1] * len(required_input) - # processed_images[self.model_input_names[0]] = [ - # padding_vector for _ in range(difference) - # ] + required_input - # else: - # raise ValueError("Invalid padding strategy:" + str(self.padding_side)) - elif return_pixel_mask and "pixel_mask" not in processed_images: - processed_images["pixel_mask"] = [1] * len(required_input) - - return processed_images - - def _get_padding_strategies(self, padding=False, max_resolution=None, pad_to_multiple_of=None, **kwargs): - """ - Find the correct padding strategy - """ - - # Get padding strategy - if padding is not False: - if padding is True: - padding_strategy = PaddingStrategy.BIGGEST # Default to pad to the biggest image in the batch - elif not isinstance(padding, PaddingStrategy): - padding_strategy = PaddingStrategy(padding) - elif isinstance(padding, PaddingStrategy): - padding_strategy = padding - else: - padding_strategy = PaddingStrategy.DO_NOT_PAD - - # Set max resolution if needed - if max_resolution is None: - if padding_strategy == PaddingStrategy.MAX_RESOLUTION: - raise ValueError( - f"When setting ``padding={PaddingStrategy.MAX_RESOLUTION}``, make sure that" - f" max_resolution is defined" - ) - - # Test if we have a padding value - if padding_strategy != PaddingStrategy.DO_NOT_PAD and (self.padding_value is None): - raise ValueError( - "Asking to pad but the image_processor does not have a padding value. " - "Please select a value to use as `padding_value`. For example: `image_processor.padding_value = 0.0`." - ) - - return padding_strategy, max_resolution, kwargs diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index d0dca675ae7e86..692d2c42a11799 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -21,6 +21,8 @@ from ...configuration_utils import PretrainedConfig from ...file_utils import add_start_docstrings from ...utils import logging + +# Add modeling imports here from ..albert.modeling_albert import ( AlbertForMaskedLM, AlbertForMultipleChoice, diff --git a/src/transformers/models/vit/__init__.py b/src/transformers/models/vit/__init__.py index 69f30c7ae8a124..31333adfd718bc 100644 --- a/src/transformers/models/vit/__init__.py +++ b/src/transformers/models/vit/__init__.py @@ -17,14 +17,16 @@ # limitations under the License. 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _BaseLazyModule, is_tokenizers_available, is_torch_available
+from ...file_utils import _BaseLazyModule, is_torch_available, is_torchvision_available
 
 
 _import_structure = {
     "configuration_vit": ["VIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTConfig"],
-    "image_processor_vit": ["ViTImageProcessor"],
 }
 
+if is_torchvision_available():
+    _import_structure["feature_extraction_vit"] = ["ViTFeatureExtractor"]
+
 if is_torch_available():
     _import_structure["modeling_vit"] = [
         "VIT_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -37,8 +39,10 @@
 
 if TYPE_CHECKING:
     from .configuration_vit import VIT_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTConfig
-    from .image_processor_vit import ViTImageProcessor
 
+    if is_torchvision_available():
+        from .feature_extraction_vit import ViTFeatureExtractor
+
     if is_torch_available():
         from .modeling_vit import (
             VIT_PRETRAINED_MODEL_ARCHIVE_LIST,
diff --git a/src/transformers/models/vit/feature_extraction_vit.py b/src/transformers/models/vit/feature_extraction_vit.py
new file mode 100644
index 00000000000000..124fdae53c23e7
--- /dev/null
+++ b/src/transformers/models/vit/feature_extraction_vit.py
@@ -0,0 +1,141 @@
+# coding=utf-8
+# Copyright Google AI and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Feature extractor class for ViT."""
+
+from typing import List, Optional, Union
+
+import numpy as np
+import PIL
+from PIL import Image
+import torch
+from torchvision import transforms as T
+
+from ...file_utils import PaddingStrategy, TensorType
+from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class ViTFeatureExtractor(FeatureExtractionMixin):
+    r"""
+    Constructs a ViT feature extractor.
+
+    This feature extractor inherits from :class:`~transformers.FeatureExtractionMixin`, which contains most of the main methods.
+    Users should refer to this superclass for more information regarding those methods.
+
+    Args:
+        image_mean (:obj:`List[float]`, defaults to :obj:`[0.485, 0.456, 0.406]`):
+            The sequence of means for each channel, to be used when normalizing images.
+        image_std (:obj:`List[float]`, defaults to :obj:`[0.229, 0.224, 0.225]`):
+            The sequence of standard deviations for each channel, to be used when normalizing images.
+        do_normalize (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not to normalize the input with mean and standard deviation.
+        do_resize (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to resize the input to a certain :obj:`size`.
+        size (:obj:`int`, `optional`, defaults to :obj:`224`):
+            Resize the input to the given size. Only has an effect if :obj:`do_resize` is set to :obj:`True`.
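+
+    Example (a minimal usage sketch of the preprocessing defined above; the ``cats.jpg`` file name is
+    purely illustrative, any RGB image opened with PIL will do)::
+
+        from PIL import Image
+        from transformers import ViTFeatureExtractor
+
+        feature_extractor = ViTFeatureExtractor()  # default mean/std, resize to 224x224
+        image = Image.open("cats.jpg")  # any RGB image
+        encoding = feature_extractor(image)
+        pixel_values = encoding["pixel_values"][0]  # torch.Tensor of shape (3, 224, 224)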
+ """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + image_mean=[0.485, 0.456, 0.406], + image_std=[0.229, 0.224, 0.225], + do_normalize=True, + do_resize=True, + size=224, + **kwargs + ): + super().__init__(**kwargs) + self.image_mean = image_mean + self.image_std = image_std + self.do_normalize = do_normalize + self.do_resize = do_resize + self.size = size + + def __call__( + self, + images: Union[ + PIL.Image.Image, np.ndarray, torch.Tensor, List[PIL.Image.Image], List[np.ndarray], List[torch.Tensor] + ], + **kwargs + ) -> BatchFeature: + """ + Main method to prepare for the model one or several image(s). + Args: + images (:obj:`PIL.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, numpy array or a Torch + tensor. + """ + # Input type checking for clearer error + assert ( + isinstance(images, PIL.Image.Image) + or isinstance(images, np.ndarray) + or isinstance(images, torch.Tensor) + or ( + ( + isinstance(images, (list, tuple)) + and ( + len(images) == 0 + or ( + isinstance(images[0], PIL.Image.Image) + or isinstance(images[0], np.ndarray) + or isinstance(images[0], torch.Tensor) + ) + ) + ) + ) + ), ( + "Images must of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single example)," + "`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of examples)." + ) + + is_batched = bool( + isinstance(images, (list, tuple)) and (isinstance(images[0], (PIL.Image.Image, np.ndarray, torch.Tensor))) + ) + + # step 1: make images a list of PIL images no matter what + if is_batched: + if isinstance(images[0], np.ndarray): + images = [Image.fromarray(image).convert("RGB") for image in images] + elif isinstance(images[0], torch.Tensor): + images = [T.ToPILImage()(image).convert("RGB") for image in images] + else: + if isinstance(images, np.ndarray): + images = [Image.fromarray(images).convert("RGB")] + elif isinstance(images, torch.Tensor): + images = [T.ToPILImage()(images).convert("RGB")] + else: + images = [images] + + # step 2: define transformations (resizing + normalization) + transformations = [] + if self.do_resize and self.size is not None: + transformations.append(T.Resize(size=(self.size, self.size))) + if self.do_normalize: + normalization = T.Compose([T.ToTensor(), T.Normalize(self.image_mean, self.image_std)]) + transformations.append(normalization) + transforms = T.Compose(transformations) + + # step 3: apply transformations to images + pixel_values = [transforms(image) for image in images] + + # return as BatchFeature + data = {"pixel_values": pixel_values} + encoded_inputs = BatchFeature(data=data) + + return encoded_inputs diff --git a/src/transformers/models/vit/image_processor_vit.py b/src/transformers/models/vit/image_processor_vit.py deleted file mode 100644 index 102dc7d302afc1..00000000000000 --- a/src/transformers/models/vit/image_processor_vit.py +++ /dev/null @@ -1,249 +0,0 @@ -# coding=utf-8 -# Copyright Google AI and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Image processor class for ViT.""" - -from typing import List, Optional, Union - -import numpy as np -import PIL -import torch -import torchvision -from torchvision import transforms as T - -from ...file_utils import PaddingStrategy, TensorType -from ...image_processor_utils import BatchImages, PreTrainedImageProcessor -from ...utils import logging - - -logger = logging.get_logger(__name__) - -## BELOW: utilities copied from -## https://github.com/facebookresearch/detr/blob/a54b77800eb8e64e3ad0d8237789fcbf2f8350c5/util/misc.py - - -def _max_by_axis(the_list): - # type: (List[List[int]]) -> List[int] - maxes = the_list[0] - for sublist in the_list[1:]: - for index, item in enumerate(sublist): - maxes[index] = max(maxes[index], item) - return maxes - - -class NestedTensor(object): - """ - Data type that handles different types of inputs (either list of images or list of sequences), and computes the - padded output (with masking). - """ - - def __init__(self, tensors, mask: Optional[torch.Tensor]): - self.tensors = tensors - self.mask = mask - - def to(self, device): - # type: (Device) -> NestedTensor # noqa - cast_tensor = self.tensors.to(device) - mask = self.mask - if mask is not None: - assert mask is not None - cast_mask = mask.to(device) - else: - cast_mask = None - return NestedTensor(cast_tensor, cast_mask) - - def decompose(self): - return self.tensors, self.mask - - def __repr__(self): - return str(self.tensors) - - -def nested_tensor_from_tensor_list(tensor_list: Union[List[torch.Tensor], torch.Tensor]): - # TODO make this more n - if tensor_list[0].ndim == 3: - if torchvision._is_tracing(): - # nested_tensor_from_tensor_list() does not export well to ONNX - # call _onnx_nested_tensor_from_tensor_list() instead - return _onnx_nested_tensor_from_tensor_list(tensor_list) - - # TODO make it support different-sized images - max_size = _max_by_axis([list(img.shape) for img in tensor_list]) - # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) - batch_shape = [len(tensor_list)] + max_size - b, c, h, w = batch_shape - dtype = tensor_list[0].dtype - device = tensor_list[0].device - tensor = torch.zeros(batch_shape, dtype=dtype, device=device) - mask = torch.zeros((b, h, w), dtype=torch.bool, device=device) - for img, pad_img, m in zip(tensor_list, tensor, mask): - pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) - m[: img.shape[1], : img.shape[2]] = True - else: - raise ValueError("Not supported") - return NestedTensor(tensor, mask) - - -class ViTImageProcessor(PreTrainedImageProcessor): - r""" - Constructs a ViT image processor. This image processor inherits from - :class:`~transformers.PreTrainedImageProcessor` which contains most of the main methods. Users should refer to this - superclass for more information regarding those methods. - Args: - image_mean (:obj:`int`, defaults to [0.485, 0.456, 0.406]): - The sequence of means for each channel, to be used when normalizing images. - image_std (:obj:`int`, defaults to [0.229, 0.224, 0.225]): - The sequence of standard deviations for each channel, to be used when normalizing images. 
- padding_value (:obj:`float`, defaults to 0.0): - The value that is used to fill the padding values. - return_attention_mask (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether or not :meth:`~transformers.ViTImageProcessor.__call__` should return :obj:`attention_mask`. - do_normalize (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether or not to normalize the input with mean and standard deviation. - do_resize (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether to resize the input to a certain :obj:`size`. - size (:obj:`int`, `optional`, defaults to :obj:`224`): - Resize the input to the given size. Only has an effect if :obj:`resize` is set to :obj:`True`. - """ - - model_input_names = ["pixel_values", "attention_mask"] - - def __init__( - self, - image_mean=[0.485, 0.456, 0.406], - image_std=[0.229, 0.224, 0.225], - padding_value=0.0, - return_attention_mask=True, - do_normalize=True, - do_resize=True, - size=224, - **kwargs - ): - super().__init__(image_mean=image_mean, image_std=image_std, padding_value=padding_value, **kwargs) - self.return_attention_mask = return_attention_mask - self.do_normalize = do_normalize - self.do_resize = do_resize - self.size = size - - def __call__( - self, - images: Union[ - PIL.Image.Image, np.ndarray, torch.Tensor, List[PIL.Image.Image], List[np.ndarray], List[torch.Tensor] - ], - padding: Union[bool, str, PaddingStrategy] = False, - max_resolution: Optional[int] = None, - pad_to_multiple_of: Optional[int] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - return_attention_mask: Optional[bool] = None, - verbose: bool = True, - **kwargs - ) -> BatchImages: - """ - Main method to prepare for the model one or several image(s). - Args: - images (:obj:`PIL.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`): - The image or batch of images to be prepared. Each image can be a PIL image, numpy array or a Torch - tensor. - padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`): - Activates and controls padding. Accepts the following values: - * :obj:`True` or :obj:`'biggest'`: Pad to the biggest image in the batch (or no padding if only a - single image is provided). - * :obj:`'max_resolution'`: Pad to a maximum resolution specified with the argument - :obj:`max_resolution` or to the maximum acceptable input resolution for the model if that argument is - not provided. - * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with images of - different resolutions). - max_resolution (:obj:`int`, `optional`): - Controls the maximum resolution to use by one of the truncation/padding parameters. If left unset or - set to :obj:`None`, this will use the predefined model maximum resolution if a maximum resolution is - required by one of the truncation/padding parameters. If the model has no specific maximum input - resolution, truncation/padding to a maximum resolution will be deactivated. - pad_to_multiple_of (:obj:`int`, `optional`): - If set will pad the sequence to a multiple of the provided value. This is especially useful to enable - the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). - return_attention_mask (:obj:`bool`, `optional`): - Whether to return the pixel mask. If left to the default, will return the pixel mask according - to the specific image processor's default. - `What are pixel masks? 
<../glossary.html#attention-mask>`__
-            return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
-                If set, will return tensors instead of list of python floats. Acceptable values are:
-                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
-                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
-                * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
-            verbose (:obj:`bool`, `optional`, defaults to :obj:`True`):
-                Whether or not to print more information and warnings.
-        """
-        # Input type checking for clearer error
-        assert (
-            isinstance(images, PIL.Image.Image)
-            or isinstance(images, np.ndarray)
-            or isinstance(images, torch.Tensor)
-            or (
-                (
-                    isinstance(images, (list, tuple))
-                    and (
-                        len(images) == 0
-                        or (
-                            isinstance(images[0], PIL.Image.Image)
-                            or isinstance(images[0], np.ndarray)
-                            or isinstance(images[0], torch.Tensor)
-                        )
-                    )
-                )
-            )
-        ), (
-            "Images must of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single example),"
-            "`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of examples)."
-        )
-
-        is_batched = bool(
-            isinstance(images, (list, tuple)) and (isinstance(images[0], (PIL.Image.Image, np.ndarray, torch.Tensor)))
-        )
-
-        # step 1: make images a list of PIL images no matter what
-        if is_batched:
-            if isinstance(images[0], np.ndarray):
-                images = [Image.fromarray(image).convert("RGB") for image in images]
-            elif isinstance(images[0], torch.Tensor):
-                images = [T.ToPILImage()(image).convert("RGB") for image in images]
-        else:
-            if isinstance(images, np.ndarray):
-                images = [Image.fromarray(images).convert("RGB")]
-            elif isinstance(images, torch.Tensor):
-                images = [T.ToPILImage()(images).convert("RGB")]
-            else:
-                images = [images]
-
-        # step 2: define transformations (resizing + normalization)
-        transformations = []
-        if self.do_resize and self.size is not None:
-            transformations.append(T.Resize(size=(self.size, self.size)))
-        if self.do_normalize:
-            normalization = T.Compose([T.ToTensor(), T.Normalize(self.image_mean, self.image_std)])
-            transformations.append(normalization)
-        transforms = T.Compose(transformations)
-
-        # step 3: apply transformations to images
-        transformed_images = [transforms(image) for image in images]
-
-        # step 4: TO DO: replace by self.pad (which is defined in image_processor_utils.py), which should
-        # take care of padding, creation of attention mask, return_tensors type
-        samples = nested_tensor_from_tensor_list(transformed_images)
-
-        # return as BatchImages
-        data = {"pixel_values": samples.tensors, "attention_mask": samples.mask}
-
-        encoded_inputs = BatchImages(data=data)
-
-        return encoded_inputs
diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index 55516263680cea..7f682bee749a99 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -39,6 +39,7 @@
     is_torch_available,
     is_torch_tpu_available,
     is_torchaudio_available,
+    is_torchvision_available,
 )
 from .integrations import is_optuna_available, is_ray_available

@@ -240,6 +241,19 @@ def require_torchaudio(test_case):
     return test_case


+def require_torchvision(test_case):
+    """
+    Decorator marking a test that requires torchvision.
+
+    These tests are skipped when torchvision isn't installed.
+
+    """
+    if not is_torchvision_available():
+        return unittest.skip("test requires torchvision")(test_case)
+    else:
+        return test_case
+
+
 def require_tf(test_case):
     """
     Decorator marking a test that requires TensorFlow.
diff --git a/tests/test_image_processor_vit.py b/tests/test_feature_extraction_vit.py similarity index 65% rename from tests/test_image_processor_vit.py rename to tests/test_feature_extraction_vit.py index 0d483423fcd973..c50783d29c84aa 100644 --- a/tests/test_image_processor_vit.py +++ b/tests/test_feature_extraction_vit.py @@ -21,14 +21,13 @@ import numpy as np import torch -from transformers import VIT_PRETRAINED_MODEL_ARCHIVE_LIST, ViTConfig, ViTImageProcessor -from transformers.testing_utils import slow +from transformers import VIT_PRETRAINED_MODEL_ARCHIVE_LIST, ViTConfig, ViTFeatureExtractor +from transformers.testing_utils import require_torch, require_torchvision, slow -from .test_image_processor_common import ImageProcessorMixin -from .test_modeling_common import floats_tensor +from .test_feature_extraction_common import FeatureExtractionSavingTestMixin -class ViTImageProcessorTester(unittest.TestCase): +class ViTFeatureExtractionTester(unittest.TestCase): def __init__( self, parent, @@ -39,8 +38,6 @@ def __init__( max_resolution=400, image_mean=[0.485, 0.456, 0.406], image_std=[0.5, 0.5, 0.5], - padding_value=0.0, - return_attention_mask=True, do_normalize=True, do_resize=True, size=18, @@ -53,75 +50,74 @@ def __init__( self.max_resolution = max_resolution self.image_mean = image_mean self.image_std = image_std - self.padding_value = padding_value - self.return_attention_mask = return_attention_mask self.do_normalize = do_normalize self.do_resize = do_resize self.size = size - def prepare_image_processor_dict(self): + @property + def feat_extract_dict(self): return { "image_mean": self.image_mean, "image_std": self.image_std, - "padding_value": self.padding_value, - "return_attention_mask": self.return_attention_mask, "do_normalize": self.do_normalize, "do_resize": self.do_resize, "size": self.size, } def prepare_inputs_numpy_for_common(self, equal_resolution=False): + # TO DO input_size = (self.num_channels, self.image_size, self.image_size) image_inputs = torch.randn((self.batch_size, *input_size)) return image_inputs def prepare_inputs_pytorch_for_common(self, equal_resolution=False): + # TO DO input_size = (self.num_channels, self.image_size, self.image_size) image_inputs = torch.randn((self.batch_size, *input_size)) return image_inputs -class ViTImageProcessorTest(ImageProcessorMixin, unittest.TestCase): - - image_processor_class = ViTImageProcessor +class ViTFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCase): + feature_extraction_class = ViTFeatureExtractor + def setUp(self): - self.image_processor_tester = VitImageProcessorTester(self) + self.feature_extract_tester = ViTFeatureExtractionTester(self) def test_call_numpy(self): - # Initialize image_processor - image_processor = self.image_processor_class(**self.image_processor_tester.prepare_image_processor_dict()) + # Initialize feature_extractor + feature_extract = self.feature_extraction_class(**self.feature_extract_tester.feat_extract_dict()) # create three inputs of resolution 800, 1000, and 1200 image_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] np_image_inputs = [np.asarray(speech_input) for speech_input in image_inputs] # Test not batched input - encoded_images_1 = image_processor(image_inputs[0], return_tensors="np").input_values - encoded_images_2 = image_processor(np_image_inputs[0], return_tensors="np").input_values + encoded_images_1 = feature_extractor(image_inputs[0], return_tensors="np").input_values + encoded_images_2 = feature_extractor(np_image_inputs[0], 
return_tensors="np").input_values self.assertTrue(np.allclose(encoded_images_1, encoded_images_2, atol=1e-3)) # Test batched - encoded_images_1 = image_processor(image_inputs, return_tensors="np").input_values - encoded_images_2 = image_processor(np_image_inputs, return_tensors="np").input_values + encoded_images_1 = feature_extractor(image_inputs, return_tensors="np").input_values + encoded_images_2 = feature_extractor(np_image_inputs, return_tensors="np").input_values for enc_seq_1, enc_seq_2 in zip(encoded_images_1, encoded_images_2): self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) def test_call_pytorch(self): - # Initialize image_processor - image_processor = self.image_processor_class(**self.image_processor_tester.prepare_image_processor_dict()) + # Initialize feature_extractor + feature_extract = self.feature_extraction_class(**self.feature_extract_tester.feat_extract_dict()) # create three inputs of resolution 800, 1000, and 1200 image_inputs = floats_tensor() # Test not batched input - encoded_images_1 = image_processor(image_inputs[0], return_tensors="pt").input_values - encoded_images_2 = image_processor(np_image_inputs[0], return_tensors="pt").input_values + encoded_images_1 = feature_extractor(image_inputs[0], return_tensors="pt").input_values + encoded_images_2 = feature_extractor(np_image_inputs[0], return_tensors="pt").input_values self.assertTrue(np.allclose(encoded_images_1, encoded_images_2, atol=1e-3)) # Test batched - encoded_images_1 = image_processor(image_inputs, return_tensors="pt").input_values - encoded_images_2 = image_processor(np_image_inputs, return_tensors="pt").input_values + encoded_images_1 = feature_extractor(image_inputs, return_tensors="pt").input_values + encoded_images_2 = feature_extractor(np_image_inputs, return_tensors="pt").input_values for enc_seq_1, enc_seq_2 in zip(encoded_images_1, encoded_images_2): self.assertTrue(torch.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) diff --git a/tests/test_modeling_vit.py b/tests/test_modeling_vit.py index a32c1f9868c25f..c26e54a7c91328 100644 --- a/tests/test_modeling_vit.py +++ b/tests/test_modeling_vit.py @@ -17,6 +17,7 @@ import unittest +import torchvision import torchvision.transforms as T from PIL import Image From b168ee48932af2421623ff78d3cb013af0545d2e Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 16 Mar 2021 11:06:17 +0100 Subject: [PATCH 17/44] Add torch.stack --- src/transformers/models/vit/feature_extraction_vit.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/vit/feature_extraction_vit.py b/src/transformers/models/vit/feature_extraction_vit.py index 124fdae53c23e7..88d80f0059e49a 100644 --- a/src/transformers/models/vit/feature_extraction_vit.py +++ b/src/transformers/models/vit/feature_extraction_vit.py @@ -131,8 +131,9 @@ def __call__( transformations.append(normalization) transforms = T.Compose(transformations) - # step 3: apply transformations to images + # step 3: apply transformations to images and stack pixel_values = [transforms(image) for image in images] + pixel_values = torch.stack(pixel_values) # return as BatchFeature data = {"pixel_values": pixel_values} From c39f1559d639b117edbc4171688c1da25a81de1f Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 16 Mar 2021 11:36:23 +0100 Subject: [PATCH 18/44] Add documentation --- docs/source/model_doc/vit.rst | 83 +++++++++++++---------------------- 1 file changed, 30 insertions(+), 53 deletions(-) diff --git a/docs/source/model_doc/vit.rst b/docs/source/model_doc/vit.rst 
index b9082b9562ed79..0e99158302a883 100644 --- a/docs/source/model_doc/vit.rst +++ b/docs/source/model_doc/vit.rst @@ -16,38 +16,50 @@ ViT Overview ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The ViT model was proposed in ` -<>`__ by . +The ViT model was proposed in `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale +`__ by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, +Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. It's the +first paper that successfully trains a Transformer encoder on ImageNet, attaining very good results compared to familiar convolutional +architectures. + The abstract from the paper is the following: -** +*While the Transformer architecture has become the de-facto standard for natural language processing tasks, its applications to +computer vision remain limited. In vision, attention is either applied in conjunction with convolutional networks, or used to replace +certain components of convolutional networks while keeping their overall structure in place. We show that this reliance on CNNs is not +necessary and a pure transformer applied directly to sequences of image patches can perform very well on image classification tasks. +When pre-trained on large amounts of data and transferred to multiple mid-sized or small image recognition benchmarks (ImageNet, +CIFAR-100, VTAB, etc.), Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while +requiring substantially fewer computational resources to train.* Tips: - +- To feed images to the Transformer encoder, each image is split into fixed-size patches, which are then linearly embedded. The authors + also add absolute position embeddings, and feed the resulting sequence of vectors to a standard Transformer encoder. +- The Vision Transformer expects each image to be of the same size (resolution), either 224x224 or 384x384 depending on the checkpoint. + One can use :class:`~transformers.ViTFeatureExtractor` to resize (or rescale) and normalize images for the model. +- Both the expected image resolution and patch resolution are reflected in the name of each checkpoint. For example, :obj:`google/vit-base-patch16-224` + refers to a base architecture with image resolution 224x224 and patch resolution of 16x16. All checkpoints can be found on the `hub `__. -ViTConfig -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The original code (written in JAX) can be found `here `__. -.. autoclass:: transformers.ViTConfig - :members: +Note that we converted the weights from Ross Wightman's `timm library `__, who already converted +the weights from JAX to PyTorch. Credits go to him! -ViTTokenizer +ViTConfig ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.ViTTokenizer - :members: build_inputs_with_special_tokens, get_special_tokens_mask, - create_token_type_ids_from_sequences, save_vocabulary +.. autoclass:: transformers.ViTConfig + :members: -ViTTokenizerFast +ViTFeatureExtractor ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. 
autoclass:: transformers.ViTTokenizerFast - :members: build_inputs_with_special_tokens, get_special_tokens_mask, - create_token_type_ids_from_sequences, save_vocabulary +.. autoclass:: transformers.ViTFeatureExtractor + :members: ViTModel @@ -57,43 +69,8 @@ ViTModel :members: forward -ViTForCausalLM -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.ViTForCausalLM - :members: forward - - -ViTForMaskedLM -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.ViTForMaskedLM - :members: forward - - -ViTForSequenceClassification -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.ViTForSequenceClassification - :members: forward - - -ViTForMultipleChoice -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.ViTForMultipleChoice - :members: forward - - -ViTForTokenClassification -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.ViTForTokenClassification - :members: forward - - -ViTForQuestionAnswering +ViTForImageClassification ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.ViTForQuestionAnswering +.. autoclass:: transformers.ViTForImageClassification :members: forward \ No newline at end of file From 20e3d1eb90241dbb264904c9e78221ffc301a12f Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 16 Mar 2021 11:47:42 +0100 Subject: [PATCH 19/44] Remove test_image_processor_common --- tests/test_feature_extraction_vit.py | 44 ++++++-- tests/test_image_processor_common.py | 147 --------------------------- 2 files changed, 37 insertions(+), 154 deletions(-) delete mode 100644 tests/test_image_processor_common.py diff --git a/tests/test_feature_extraction_vit.py b/tests/test_feature_extraction_vit.py index c50783d29c84aa..888bc26ec3e94f 100644 --- a/tests/test_feature_extraction_vit.py +++ b/tests/test_feature_extraction_vit.py @@ -55,7 +55,7 @@ def __init__( self.size = size @property - def feat_extract_dict(self): + def prepare_feat_extract_dict(self): return { "image_mean": self.image_mean, "image_std": self.image_std, @@ -64,14 +64,14 @@ def feat_extract_dict(self): "size": self.size, } - def prepare_inputs_numpy_for_common(self, equal_resolution=False): + def prepare_inputs_numpy(self, equal_resolution=False): # TO DO input_size = (self.num_channels, self.image_size, self.image_size) image_inputs = torch.randn((self.batch_size, *input_size)) return image_inputs - def prepare_inputs_pytorch_for_common(self, equal_resolution=False): + def prepare_inputs_pytorch(self, equal_resolution=False): # TO DO input_size = (self.num_channels, self.image_size, self.image_size) image_inputs = torch.randn((self.batch_size, *input_size)) @@ -86,6 +86,40 @@ class ViTFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCa def setUp(self): self.feature_extract_tester = ViTFeatureExtractionTester(self) + @property + def feat_extract_dict(self): + return self.feat_extract_tester.prepare_feat_extract_dict() + + def test_feat_extract_properties(self): + feat_extract = 
self.feature_extraction_class(**self.feat_extract_dict) + self.assertTrue(hasattr(feat_extract, "image_mean")) + self.assertTrue(hasattr(feat_extract, "image_std")) + self.assertTrue(hasattr(feat_extract, "do_normalize")) + self.assertTrue(hasattr(feat_extract, "do_resize")) + self.assertTrue(hasattr(feat_extract, "size")) + + def test_batch_feature(self): + image_inputs = self.feat_extract_tester.prepare_inputs_for_common() + feat_extract = self.feature_extraction_class(**self.feat_extract_dict) + input_name = feat_extract.model_input_names[0] + + processed_features = BatchFeature({input_name: image_inputs}) + + self.assertTrue(all(len(x) == len(y) for x, y in zip(image_inputs, processed_features[input_name]))) + + image_inputs = self.feat_extract_tester.prepare_inputs_for_common(equal_length=True) + processed_features = BatchFeature({input_name: image_inputs}, tensor_type="np") + + batch_features_input = processed_features[input_name] + + if len(batch_features_input.shape) < 3: + batch_features_input = batch_features_input[:, :, None] + + self.assertTrue( + batch_features_input.shape + == (self.feat_extract_tester.batch_size, len(image_inputs[0]), self.feat_extract_tester.feature_size) + ) + def test_call_numpy(self): # Initialize feature_extractor feature_extract = self.feature_extraction_class(**self.feature_extract_tester.feat_extract_dict()) @@ -123,7 +157,3 @@ def test_call_pytorch(self): def test_normalization(self): pass - - @slow - def test_pretrained_checkpoints_are_set_correctly(self): - pass diff --git a/tests/test_image_processor_common.py b/tests/test_image_processor_common.py deleted file mode 100644 index ceb437fe38a233..00000000000000 --- a/tests/test_image_processor_common.py +++ /dev/null @@ -1,147 +0,0 @@ -# coding=utf-8 -# Copyright 2021 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import json -import os -import tempfile - -import numpy as np - -from transformers import BatchImages -from transformers.testing_utils import require_tf, require_torch - - -class ImageProcessorMixin: - - # to overwrite at image processor specific tests - image_processor_tester = None - image_processor_class = None - - @property - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - def test_image_processor_common_properties(self): - image_processor = self.image_processor_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processor, "image_mean")) - self.assertTrue(hasattr(image_processor, "image_std")) - self.assertTrue(hasattr(image_processor, "padding_value")) - - def test_image_processor_to_json_string(self): - image_processor = self.image_processor_class(**self.image_processor_dict) - obj = json.loads(image_processor.to_json_string()) - for key, value in self.image_processor_dict.items(): - self.assertEqual(obj[key], value) - - def test_image_processor_to_json_file(self): - image_processor_first = self.image_processor_class(**self.image_processor_dict) - - with tempfile.TemporaryDirectory() as tmpdirname: - json_file_path = os.path.join(tmpdirname, "image_processor.json") - image_processor_first.to_json_file(json_file_path) - image_processor_second = self.image_processor_class.from_json_file(json_file_path) - - self.assertEqual(image_processor_second.to_dict(), image_processor_first.to_dict()) - - def test_image_processor_from_and_save_pretrained(self): - image_processor_first = self.image_processor_class(**self.image_processor_dict) - - with tempfile.TemporaryDirectory() as tmpdirname: - image_processor_first.save_pretrained(tmpdirname) - image_processor_second = self.image_processor_class.from_pretrained(tmpdirname) - - self.assertEqual(image_processor_second.to_dict(), image_processor_first.to_dict()) - - def test_init_without_params(self): - image_processor = self.image_processor_class() - self.assertIsNotNone(image_processor) - - def test_batch_images_numpy(self): - # Define the images + initialize image_processor - image_inputs = self.image_processor_tester.prepare_inputs_numpy_for_common() - image_processor = self.image_processor_class(**self.image_processor_dict) - input_name = image_processor.model_input_names[0] - - processed_images = BatchImages({input_name: image_inputs}) - - self.assertTrue(all(len(x) == len(y) for x, y in zip(image_inputs, processed_images[input_name]))) - - image_inputs = self.image_processor_tester.prepare_inputs_numpy_for_common(equal_resolution=True) - processed_images = BatchImages({input_name: image_inputs}, tensor_type="np") - - batch_images_input = processed_images[input_name] - - if len(batch_images_input.shape) < 3: - batch_images_input = batch_images_input[:, :, None] - - # self.assertTrue( - # batch_images_input.shape - # == (self.image_processor_tester.batch_size, len(image_inputs[0]), self.image_processor_tester.feature_size) - # ) - - @require_torch - def test_batch_images_pt(self): - image_inputs = self.image_processor_tester.prepare_inputs_pytorch_for_common(equal_length=True) - image_processor = self.image_processor_class(**self.image_processor_dict) - input_name = image_processor.model_input_names[0] - - processed_images = BatchImages({input_name: image_inputs}, tensor_type="pt") - - batch_images_input = processed_images[input_name] - - if len(batch_images_input.shape) < 3: - batch_images_input = batch_images_input[:, :, None] - - # self.assertTrue( - # 
batch_images_input.shape - # == (self.image_processor_tester.batch_size, len(image_inputs[0]), self.image_processor_tester.feature_size) - # ) - - @require_tf - def test_batch_images_tf(self): - pass - - def _check_padding(self, numpify=False): - pass - - def test_padding_from_list(self): - self._check_padding(numpify=False) - - def test_padding_from_array(self): - self._check_padding(numpify=True) - - @require_torch - def test_padding_accepts_tensors_pt(self): - pass - - @require_tf - def test_padding_accepts_tensors_tf(self): - pass - - def test_attention_mask(self): - feat_dict = self.image_processor_dict - feat_dict["return_attention_mask"] = True - image_processor = self.image_processor_class(**feat_dict) - image_inputs = self.image_processor_tester.prepare_inputs_pytorch_for_common() - input_lenghts = [len(x) for x in image_inputs] - input_name = image_processor.model_input_names[0] - - processed = BatchImages({input_name: image_inputs}) - - processed = image_processor.pad(processed, padding="biggest", return_tensors="np") - self.assertIn("attention_mask", processed) - self.assertListEqual(list(processed.attention_mask.shape), list(processed[input_name].shape[:2])) - self.assertListEqual(processed.attention_mask.sum(-1).tolist(), input_lenghts) From b2b34328b9c70e44ac434ba75643ea56c5c915c8 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 16 Mar 2021 13:43:26 +0100 Subject: [PATCH 20/44] Improve model tests --- tests/test_modeling_vit.py | 40 +++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/tests/test_modeling_vit.py b/tests/test_modeling_vit.py index c26e54a7c91328..5cbe12be929b33 100644 --- a/tests/test_modeling_vit.py +++ b/tests/test_modeling_vit.py @@ -16,6 +16,7 @@ import unittest +import inspect import torchvision import torchvision.transforms as T @@ -26,7 +27,9 @@ from transformers.testing_utils import require_torch, slow, torch_device from .test_configuration_common import ConfigTester -from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask +from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor + +torch_device = 'cpu' if is_torch_available(): @@ -45,7 +48,6 @@ def __init__( patch_size=2, num_channels=3, is_training=True, - use_input_mask=True, use_labels=True, hidden_size=32, num_hidden_layers=5, @@ -65,7 +67,6 @@ def __init__( self.patch_size = patch_size self.num_channels = num_channels self.is_training = is_training - self.use_input_mask = use_input_mask self.use_labels = use_labels self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers @@ -82,10 +83,6 @@ def __init__( def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.num_channels, self.image_size, self.image_size]) - image_labels = None if self.use_labels: image_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) @@ -105,22 +102,22 @@ def prepare_config_and_inputs(self): initializer_range=self.initializer_range, ) - return config, pixel_values, input_mask, image_labels + return config, pixel_values, image_labels - def create_and_check_model(self, config, pixel_values, input_mask, image_labels): + def create_and_check_model(self, config, pixel_values, image_labels): model = ViTModel(config=config) model.to(torch_device) model.eval() - result = model(pixel_values, 
attention_mask=input_mask) + result = model(pixel_values) result = model(pixel_values) self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - def create_and_check_for_image_classification(self, config, pixel_values, input_mask, image_labels): + def create_and_check_for_image_classification(self, config, pixel_values, image_labels): config.num_labels = self.num_labels model = ViTForImageClassification(config) model.to(torch_device) model.eval() - result = model(pixel_values, attention_mask=input_mask, labels=image_labels) + result = model(pixel_values, labels=image_labels) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) def prepare_config_and_inputs_for_common(self): @@ -128,10 +125,9 @@ def prepare_config_and_inputs_for_common(self): ( config, pixel_values, - input_mask, image_labels, ) = config_and_inputs - inputs_dict = {"pixel_values": pixel_values, "attention_mask": input_mask} + inputs_dict = {"pixel_values": pixel_values} return config, inputs_dict @@ -159,6 +155,22 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() + def test_inputs_embeds(self): + # ViT does not use inputs_embeds + pass + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) From 43ba11f9f8f90ab1e00948f02e85ef4db0f9724a Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 16 Mar 2021 19:14:04 +0100 Subject: [PATCH 21/44] Add is_torchvision_available to general init of vit --- docs/source/model_doc/vit.rst | 4 ++-- src/transformers/__init__.py | 9 +++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/docs/source/model_doc/vit.rst b/docs/source/model_doc/vit.rst index 0e99158302a883..533c47c9a61ff2 100644 --- a/docs/source/model_doc/vit.rst +++ b/docs/source/model_doc/vit.rst @@ -10,13 +10,13 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -ViT +Vision Transformer (ViT) ----------------------------------------------------------------------------------------------------------------------- Overview ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The ViT model was proposed in `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale +The Vision Transformer (ViT) model was proposed in `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale `__ by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 
It's the first paper that successfully trains a Transformer encoder on ImageNet, attaining very good results compared to familiar convolutional diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 0795dbf3e81ce6..39ce5ce772af21 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -126,7 +126,7 @@ ], "models": [], # Models - "models.vit": ["VIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTConfig", "ViTFeatureExtractor"], + "models.vit": ["VIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTConfig"], "models.wav2vec2": [ "WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP", "Wav2Vec2Config", @@ -296,7 +296,7 @@ name for name in dir(dummy_sentencepiece_objects) if not name.startswith("_") ] -# tokenziers-backed objects +# tokenizers-backed objects if is_tokenizers_available(): # Fast tokenizers _import_structure["models.convbert"].append("ConvBertTokenizerFast") @@ -407,7 +407,6 @@ "ViTLayer", "ViTModel", "ViTPreTrainedModel", - "load_tf_weights_in_vit", ] ) @@ -1550,6 +1549,9 @@ else: from .utils.dummy_tokenizers_objects import * + if is_torchvision_available(): + from .models.vit import ViTFeatureExtractor + # Modeling if is_torch_available(): @@ -1953,7 +1955,6 @@ ViTLayer, ViTModel, ViTPreTrainedModel, - load_tf_weights_in_vit, ) from .models.wav2vec2 import ( WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST, From 4fb8def78955db081ba2b16a5cffd637c8044e2d Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 16 Mar 2021 19:33:21 +0100 Subject: [PATCH 22/44] Fix import of ViTFeatureExtractor --- src/transformers/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 39ce5ce772af21..f56a32e027e929 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -48,6 +48,7 @@ is_tf_available, is_tokenizers_available, is_torch_available, + is_torchvision_available, ) from .utils import logging @@ -104,6 +105,7 @@ "is_tokenizers_available", "is_torch_available", "is_torch_tpu_available", + "is_torchvision_available", ], "hf_argparser": ["HfArgumentParser"], "integrations": [ @@ -1259,6 +1261,9 @@ name for name in dir(dummy_flax_objects) if not name.startswith("_") ] +# Torchvision-backed objects +if is_torchvision_available(): + _import_structure["models.vit"] = ["ViTFeatureExtractor"] # Direct imports for type-checking if TYPE_CHECKING: @@ -1427,7 +1432,6 @@ from .models.vit import ( VIT_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTConfig, - ViTFeatureExtractor, ) from .models.wav2vec2 import ( WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP, From 4c91fb329541e4a0ed88239b94794cc5bfbc8ee9 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 16 Mar 2021 19:37:58 +0100 Subject: [PATCH 23/44] Fix another bug with init --- src/transformers/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index f56a32e027e929..c24c8f35c0395c 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1263,7 +1263,7 @@ # Torchvision-backed objects if is_torchvision_available(): - _import_structure["models.vit"] = ["ViTFeatureExtractor"] + _import_structure["models.vit"].extend(["ViTFeatureExtractor"]) # Direct imports for type-checking if TYPE_CHECKING: From c3dfbe6ad3ff46ab84864c42cb4a1c43e1686460 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 16 Mar 2021 19:41:57 +0100 Subject: [PATCH 24/44] Use append instead of extend --- src/transformers/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index c24c8f35c0395c..e17d7574903bf9 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1263,7 +1263,7 @@ # Torchvision-backed objects if is_torchvision_available(): - _import_structure["models.vit"].extend(["ViTFeatureExtractor"]) + _import_structure["models.vit"].append("ViTFeatureExtractor") # Direct imports for type-checking if TYPE_CHECKING: From 5cd7dfdf4db7340ab906ef740ab91bea45a34ee1 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 17 Mar 2021 11:23:40 +0100 Subject: [PATCH 25/44] Make all tests of ViTFeatureExtractor pass --- .circleci/config.yml | 16 +- setup.py | 2 + src/transformers/file_utils.py | 4 +- .../models/vit/feature_extraction_vit.py | 27 ++-- tests/test_feature_extraction_vit.py | 145 ++++++++++-------- tests/test_modeling_vit.py | 12 +- 6 files changed, 112 insertions(+), 94 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index f8040e7553f7b5..4083be1cc2e58b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -79,8 +79,8 @@ jobs: - v0.4-{{ checksum "setup.py" }} - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev - run: pip install --upgrade pip - - run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,speech] - - run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cpu.html + - run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,speech,vision] + - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cpu.html - save_cache: key: v0.4-{{ checksum "setup.py" }} paths: @@ -107,8 +107,8 @@ jobs: - v0.4-{{ checksum "setup.py" }} - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev - run: pip install --upgrade pip - - run: pip install .[sklearn,flax,torch,testing,sentencepiece,speech] - - run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cpu.html + - run: pip install .[sklearn,flax,torch,testing,sentencepiece,speech,vision] + - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cpu.html - save_cache: key: v0.4-{{ checksum "setup.py" }} paths: @@ -135,8 +135,8 @@ jobs: - v0.4-{{ checksum "setup.py" }} - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev - run: pip install --upgrade pip - - run: pip install .[sklearn,torch,testing,sentencepiece,speech] - - run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cpu.html + - run: pip install .[sklearn,torch,testing,sentencepiece,speech,vision] + - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cpu.html - save_cache: key: v0.4-torch-{{ checksum "setup.py" }} paths: @@ -215,8 +215,8 @@ jobs: - v0.4-{{ checksum "setup.py" }} - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev - run: pip install --upgrade pip - - run: pip install .[sklearn,torch,testing,sentencepiece,speech] - - run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cpu.html + - run: pip install .[sklearn,torch,testing,sentencepiece,speech,vision] + - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cpu.html - save_cache: key: v0.4-torch-{{ checksum "setup.py" }} paths: diff --git a/setup.py b/setup.py index 0744058e661081..16eefa899145b3 100644 --- a/setup.py +++ b/setup.py @@ -130,6 +130,7 @@ "tokenizers>=0.10.1,<0.11", "torch>=1.0", "torchaudio", + "torchvision", 
"tqdm>=4.27", "unidic>=1.0.2", "unidic_lite>=1.0.7", @@ -225,6 +226,7 @@ def run(self): extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette") extras["speech"] = deps_list("soundfile", "torchaudio") +extras["vision"] = deps_list("torchvision") extras["sentencepiece"] = deps_list("sentencepiece", "protobuf") extras["testing"] = ( diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py index f26f566408bdcb..07b21dd4453269 100644 --- a/src/transformers/file_utils.py +++ b/src/transformers/file_utils.py @@ -183,7 +183,7 @@ _soundfile_available = False -_torchaudio_available = importlib.util.find_spec("torchaudio") +_torchaudio_available = importlib.util.find_spec("torchaudio") is not None try: _torchaudio_version = importlib_metadata.version("torchaudio") logger.debug(f"Successfully imported torchaudio version {_torchaudio_version}") @@ -191,7 +191,7 @@ _torchaudio_available = False -_torchvision_available = importlib.util.find_spec("torchvision") +_torchvision_available = importlib.util.find_spec("torchvision") is not None try: _torchvision_version = importlib_metadata.version("torchvision") logger.debug(f"Successfully imported torchvision version {_torchvision_version}") diff --git a/src/transformers/models/vit/feature_extraction_vit.py b/src/transformers/models/vit/feature_extraction_vit.py index 88d80f0059e49a..7ac79dc6531d30 100644 --- a/src/transformers/models/vit/feature_extraction_vit.py +++ b/src/transformers/models/vit/feature_extraction_vit.py @@ -17,7 +17,7 @@ from typing import List, Optional, Union import numpy as np -import PIL +from PIL import Image import torch from torchvision import transforms as T @@ -45,7 +45,7 @@ class ViTFeatureExtractor(FeatureExtractionMixin): Whether or not to normalize the input with mean and standard deviation. do_resize (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to resize the input to a certain :obj:`size`. - size (:obj:`int`, `optional`, defaults to :obj:`224`): + size (:obj:`int`, `optional`, defaults to :obj:`List[224, 224]`): Resize the input to the given size. Only has an effect if :obj:`do_resize` is set to :obj:`True`. """ @@ -57,7 +57,7 @@ def __init__( image_std=[0.229, 0.224, 0.225], do_normalize=True, do_resize=True, - size=224, + size=[224,224], **kwargs ): super().__init__(**kwargs) @@ -70,20 +70,21 @@ def __init__( def __call__( self, images: Union[ - PIL.Image.Image, np.ndarray, torch.Tensor, List[PIL.Image.Image], List[np.ndarray], List[torch.Tensor] + Image.Image, np.ndarray, torch.Tensor, List[Image.Image], List[np.ndarray], List[torch.Tensor] ], **kwargs ) -> BatchFeature: """ Main method to prepare for the model one or several image(s). Args: - images (:obj:`PIL.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`): + images (:obj:`PIL.Image.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`): The image or batch of images to be prepared. Each image can be a PIL image, numpy array or a Torch - tensor. + tensor. In case of a numpy array/Torch tensor, each image should be of shape (C, H, W), where C is a number of channels, + H and W are image height and width. 
""" # Input type checking for clearer error assert ( - isinstance(images, PIL.Image.Image) + isinstance(images, Image.Image) or isinstance(images, np.ndarray) or isinstance(images, torch.Tensor) or ( @@ -92,7 +93,7 @@ def __call__( and ( len(images) == 0 or ( - isinstance(images[0], PIL.Image.Image) + isinstance(images[0], Image.Image) or isinstance(images[0], np.ndarray) or isinstance(images[0], torch.Tensor) ) @@ -105,18 +106,20 @@ def __call__( ) is_batched = bool( - isinstance(images, (list, tuple)) and (isinstance(images[0], (PIL.Image.Image, np.ndarray, torch.Tensor))) + isinstance(images, (list, tuple)) and (isinstance(images[0], (Image.Image, np.ndarray, torch.Tensor))) ) # step 1: make images a list of PIL images no matter what if is_batched: if isinstance(images[0], np.ndarray): - images = [Image.fromarray(image).convert("RGB") for image in images] + # PIL expects the channel dimension as last dimension + images = [Image.fromarray(np.moveaxis(image, 0, -1)) for image in images] elif isinstance(images[0], torch.Tensor): images = [T.ToPILImage()(image).convert("RGB") for image in images] else: if isinstance(images, np.ndarray): - images = [Image.fromarray(images).convert("RGB")] + # PIL expects the channel dimension as last dimension + images = [Image.fromarray(np.moveaxis(images, 0, -1))] elif isinstance(images, torch.Tensor): images = [T.ToPILImage()(images).convert("RGB")] else: @@ -125,7 +128,7 @@ def __call__( # step 2: define transformations (resizing + normalization) transformations = [] if self.do_resize and self.size is not None: - transformations.append(T.Resize(size=(self.size, self.size))) + transformations.append(T.Resize(size=self.size)) if self.do_normalize: normalization = T.Compose([T.ToTensor(), T.Normalize(self.image_mean, self.image_std)]) transformations.append(normalization) diff --git a/tests/test_feature_extraction_vit.py b/tests/test_feature_extraction_vit.py index 888bc26ec3e94f..4d69138dfb7ddb 100644 --- a/tests/test_feature_extraction_vit.py +++ b/tests/test_feature_extraction_vit.py @@ -17,15 +17,22 @@ import itertools import random import unittest +import requests import numpy as np -import torch from transformers import VIT_PRETRAINED_MODEL_ARCHIVE_LIST, ViTConfig, ViTFeatureExtractor -from transformers.testing_utils import require_torch, require_torchvision, slow +from transformers.file_utils import is_torch_available, is_torchvision_available +from transformers.testing_utils import require_torch, require_torchvision from .test_feature_extraction_common import FeatureExtractionSavingTestMixin +if is_torch_available(): + import torch + +if is_torchvision_available(): + from PIL import Image + class ViTFeatureExtractionTester(unittest.TestCase): def __init__( @@ -33,14 +40,14 @@ def __init__( parent, batch_size=7, num_channels=3, - image_size=224, + image_size=18, min_resolution=30, max_resolution=400, image_mean=[0.485, 0.456, 0.406], image_std=[0.5, 0.5, 0.5], do_normalize=True, do_resize=True, - size=18, + size=[18,18], ): self.parent = parent self.batch_size = batch_size @@ -54,7 +61,6 @@ def __init__( self.do_resize = do_resize self.size = size - @property def prepare_feat_extract_dict(self): return { "image_mean": self.image_mean, @@ -63,22 +69,38 @@ def prepare_feat_extract_dict(self): "do_resize": self.do_resize, "size": self.size, } - - def prepare_inputs_numpy(self, equal_resolution=False): - # TO DO - input_size = (self.num_channels, self.image_size, self.image_size) - image_inputs = torch.randn((self.batch_size, *input_size)) - - return 
image_inputs - - def prepare_inputs_pytorch(self, equal_resolution=False): - # TO DO - input_size = (self.num_channels, self.image_size, self.image_size) - image_inputs = torch.randn((self.batch_size, *input_size)) + + def prepare_inputs(self, equal_resolution=False, numpify=False, torchify=False): + """ This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, + or a list of PyTorch tensors if one specifies torchify=True. + """ + + assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time" + + if equal_resolution: + image_inputs = [ + np.random.randint(255, size=(self.num_channels, self.max_resolution, self.max_resolution), + dtype=np.uint8) + for i in range(self.batch_size) + ] + else: + image_inputs = [] + for i in range(self.batch_size): + width, height = np.random.choice(np.arange(self.min_resolution, self.max_resolution), 2) + image_inputs.append(np.random.randint(255, size=(self.num_channels, width, height), dtype=np.uint8)) + + if not numpify and not torchify: + # PIL expects the channel dimension as last dimension + image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] + + if torchify: + image_inputs = [torch.from_numpy(x) for x in image_inputs] return image_inputs +@require_torchvision +@require_torch class ViTFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCase): feature_extraction_class = ViTFeatureExtractor @@ -88,72 +110,61 @@ def setUp(self): @property def feat_extract_dict(self): - return self.feat_extract_tester.prepare_feat_extract_dict() + return self.feature_extract_tester.prepare_feat_extract_dict() def test_feat_extract_properties(self): - feat_extract = self.feature_extraction_class(**self.feat_extract_dict) - self.assertTrue(hasattr(feat_extract, "image_mean")) - self.assertTrue(hasattr(feat_extract, "image_std")) - self.assertTrue(hasattr(feat_extract, "do_normalize")) - self.assertTrue(hasattr(feat_extract, "do_resize")) - self.assertTrue(hasattr(feat_extract, "size")) + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + self.assertTrue(hasattr(feature_extractor, "image_mean")) + self.assertTrue(hasattr(feature_extractor, "image_std")) + self.assertTrue(hasattr(feature_extractor, "do_normalize")) + self.assertTrue(hasattr(feature_extractor, "do_resize")) + self.assertTrue(hasattr(feature_extractor, "size")) def test_batch_feature(self): - image_inputs = self.feat_extract_tester.prepare_inputs_for_common() - feat_extract = self.feature_extraction_class(**self.feat_extract_dict) - input_name = feat_extract.model_input_names[0] - - processed_features = BatchFeature({input_name: image_inputs}) - - self.assertTrue(all(len(x) == len(y) for x, y in zip(image_inputs, processed_features[input_name]))) - - image_inputs = self.feat_extract_tester.prepare_inputs_for_common(equal_length=True) - processed_features = BatchFeature({input_name: image_inputs}, tensor_type="np") - - batch_features_input = processed_features[input_name] + pass + + def test_call_pil(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random PIL images + image_inputs = self.feature_extract_tester.prepare_inputs(equal_resolution=False) - if len(batch_features_input.shape) < 3: - batch_features_input = batch_features_input[:, :, None] + # Test not batched input + encoded_images = feature_extractor(image_inputs[0]).pixel_values + self.assertEqual(encoded_images.shape, (1, 
self.feature_extract_tester.num_channels, *self.feature_extract_tester.size)) - self.assertTrue( - batch_features_input.shape - == (self.feat_extract_tester.batch_size, len(image_inputs[0]), self.feat_extract_tester.feature_size) - ) + # Test batched + encoded_images = feature_extractor(image_inputs).pixel_values + self.assertEqual(encoded_images.shape, (self.feature_extract_tester.batch_size, self.feature_extract_tester.num_channels, + *self.feature_extract_tester.size)) def test_call_numpy(self): # Initialize feature_extractor - feature_extract = self.feature_extraction_class(**self.feature_extract_tester.feat_extract_dict()) - # create three inputs of resolution 800, 1000, and 1200 - image_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] - np_image_inputs = [np.asarray(speech_input) for speech_input in image_inputs] + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random numpy tensors + image_inputs = self.feature_extract_tester.prepare_inputs(equal_resolution=False, numpify=True) # Test not batched input - encoded_images_1 = feature_extractor(image_inputs[0], return_tensors="np").input_values - encoded_images_2 = feature_extractor(np_image_inputs[0], return_tensors="np").input_values - self.assertTrue(np.allclose(encoded_images_1, encoded_images_2, atol=1e-3)) - + encoded_images = feature_extractor(image_inputs[0]).pixel_values + self.assertEqual(encoded_images.shape, (1, self.feature_extract_tester.num_channels, + *self.feature_extract_tester.size)) + # Test batched - encoded_images_1 = feature_extractor(image_inputs, return_tensors="np").input_values - encoded_images_2 = feature_extractor(np_image_inputs, return_tensors="np").input_values - for enc_seq_1, enc_seq_2 in zip(encoded_images_1, encoded_images_2): - self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) + encoded_images = feature_extractor(image_inputs).pixel_values + self.assertEqual(encoded_images.shape, (self.feature_extract_tester.batch_size, self.feature_extract_tester.num_channels, + *self.feature_extract_tester.size)) def test_call_pytorch(self): # Initialize feature_extractor - feature_extract = self.feature_extraction_class(**self.feature_extract_tester.feat_extract_dict()) - # create three inputs of resolution 800, 1000, and 1200 - image_inputs = floats_tensor() + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random PyTorch tensors + image_inputs = self.feature_extract_tester.prepare_inputs(equal_resolution=False, torchify=True) # Test not batched input - encoded_images_1 = feature_extractor(image_inputs[0], return_tensors="pt").input_values - encoded_images_2 = feature_extractor(np_image_inputs[0], return_tensors="pt").input_values - self.assertTrue(np.allclose(encoded_images_1, encoded_images_2, atol=1e-3)) + encoded_images = feature_extractor(image_inputs[0]).pixel_values + self.assertEqual(encoded_images.shape, (1, self.feature_extract_tester.num_channels, *self.feature_extract_tester.size)) # Test batched - encoded_images_1 = feature_extractor(image_inputs, return_tensors="pt").input_values - encoded_images_2 = feature_extractor(np_image_inputs, return_tensors="pt").input_values - for enc_seq_1, enc_seq_2 in zip(encoded_images_1, encoded_images_2): - self.assertTrue(torch.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) - - def test_normalization(self): - pass + encoded_images = feature_extractor(image_inputs).pixel_values + self.assertEqual(encoded_images.shape, (self.feature_extract_tester.batch_size, 
self.feature_extract_tester.num_channels, + *self.feature_extract_tester.size)) diff --git a/tests/test_modeling_vit.py b/tests/test_modeling_vit.py index 5cbe12be929b33..54167d69f8170b 100644 --- a/tests/test_modeling_vit.py +++ b/tests/test_modeling_vit.py @@ -18,12 +18,8 @@ import unittest import inspect -import torchvision -import torchvision.transforms as T -from PIL import Image - import requests -from transformers import is_torch_available +from transformers.file_utils import is_torch_available, is_torchvision_available from transformers.testing_utils import require_torch, slow, torch_device from .test_configuration_common import ConfigTester @@ -39,6 +35,12 @@ from transformers.models.vit.modeling_vit import VIT_PRETRAINED_MODEL_ARCHIVE_LIST +if is_torchvision_available(): + import torchvision + import torchvision.transforms as T + from PIL import Image + + class ViTModelTester: def __init__( self, From 9637b8561797041095942662bbcfa9ae97ccaf9c Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 17 Mar 2021 12:37:16 +0100 Subject: [PATCH 26/44] Improve model tests --- src/transformers/models/vit/modeling_vit.py | 6 +-- tests/test_modeling_vit.py | 55 +++++++++++++++++++-- 2 files changed, 54 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py index 1eafcc8daf9523..c7bc02b25dd57a 100644 --- a/src/transformers/models/vit/modeling_vit.py +++ b/src/transformers/models/vit/modeling_vit.py @@ -122,13 +122,13 @@ def __init__(self, image_size=224, patch_size=16, num_channels=3, embed_dim=768) self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=patch_size) - def forward(self, x): - B, C, H, W = x.shape + def forward(self, pixel_values): + B, C, H, W = pixel_values.shape # FIXME look at relaxing size constraints assert ( H == self.image_size[0] and W == self.image_size[1] ), f"Input image size ({H}*{W}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})." - x = self.projection(x).flatten(2).transpose(1, 2) + x = self.projection(pixel_values).flatten(2).transpose(1, 2) return x diff --git a/tests/test_modeling_vit.py b/tests/test_modeling_vit.py index 54167d69f8170b..210eea959dde7c 100644 --- a/tests/test_modeling_vit.py +++ b/tests/test_modeling_vit.py @@ -111,8 +111,7 @@ def create_and_check_model(self, config, pixel_values, image_labels): model.to(torch_device) model.eval() result = model(pixel_values) - result = model(pixel_values) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.patch_size**2 + 1, self.hidden_size)) def create_and_check_for_image_classification(self, config, pixel_values, image_labels): config.num_labels = self.num_labels @@ -135,7 +134,10 @@ def prepare_config_and_inputs_for_common(self): @require_torch class ViTModelTest(ModelTesterMixin, unittest.TestCase): - + """ + Here we also overwrite some of the tests of test_modeling_common.py, as ViT does not use input_ids, inputs_embeds, + attention_mask and seq_length. 
+ """ all_model_classes = ( ( ViTModel, @@ -161,6 +163,16 @@ def test_inputs_embeds(self): # ViT does not use inputs_embeds pass + def test_model_common_attributes(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_patch_embeddings(), (torch.nn.Module)) + model.set_patch_embeddings(torch.nn.Embedding(10, 10)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, torch.nn.Linear)) + def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() @@ -177,6 +189,41 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + seq_length = self.model_tester.patch_size**2 + 1 + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [seq_length, self.model_tester.hidden_size], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + def test_for_image_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_image_classification(*config_and_inputs) @@ -189,7 +236,7 @@ def test_model_from_pretrained(self): # We will verify our results on an image of cute cats -# TODO: use VitImageProcessor in the future +# TODO: use ViTFeatureExtractor in the future def prepare_img(image_resolution): url = "http://images.cocodataset.org/val2017/000000039769.jpg" im = Image.open(requests.get(url, stream=True).raw) From 872ae16d87775fc9440fbd249a5a140a54b81664 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 17 Mar 2021 15:15:11 +0100 Subject: [PATCH 27/44] 24 model tests pass, 6 fail on cpu --- src/transformers/__init__.py | 2 ++ src/transformers/models/auto/__init__.py | 2 ++ src/transformers/models/vit/modeling_vit.py | 16 ++++++++-------- tests/test_feature_extraction_vit.py | 6 ++++++ tests/test_modeling_common.py | 2 ++ tests/test_modeling_vit.py | 17 ++++++++--------- 6 files changed, 28 insertions(+), 17 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index e17d7574903bf9..531d69051f8a41 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -460,6 +460,7 @@ _import_structure["models.auto"].extend( [ "MODEL_FOR_CAUSAL_LM_MAPPING", + "MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING", "MODEL_FOR_MASKED_LM_MAPPING", "MODEL_FOR_MULTIPLE_CHOICE_MAPPING", "MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING", @@ -1615,6 +1616,7 @@ ) from 
.models.auto import ( MODEL_FOR_CAUSAL_LM_MAPPING, + MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, MODEL_FOR_MASKED_LM_MAPPING, MODEL_FOR_MULTIPLE_CHOICE_MAPPING, MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING, diff --git a/src/transformers/models/auto/__init__.py b/src/transformers/models/auto/__init__.py index 0fd4e9041f3d65..1226a0b8370a59 100644 --- a/src/transformers/models/auto/__init__.py +++ b/src/transformers/models/auto/__init__.py @@ -29,6 +29,7 @@ if is_torch_available(): _import_structure["modeling_auto"] = [ "MODEL_FOR_CAUSAL_LM_MAPPING", + "MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING", "MODEL_FOR_MASKED_LM_MAPPING", "MODEL_FOR_MULTIPLE_CHOICE_MAPPING", "MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING", @@ -90,6 +91,7 @@ if is_torch_available(): from .modeling_auto import ( MODEL_FOR_CAUSAL_LM_MAPPING, + MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, MODEL_FOR_MASKED_LM_MAPPING, MODEL_FOR_MULTIPLE_CHOICE_MAPPING, MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING, diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py index c7bc02b25dd57a..c44e04f7a85c0e 100644 --- a/src/transformers/models/vit/modeling_vit.py +++ b/src/transformers/models/vit/modeling_vit.py @@ -80,7 +80,6 @@ class ViTEmbeddings(nn.Module): Based on timm implementation, which can be found here: https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py - """ def __init__(self, config): @@ -109,7 +108,11 @@ def forward(self, pixel_values): class PatchEmbeddings(nn.Module): - """ Image to Patch Embedding.""" + """ Image to Patch Embedding. + + Based on timm implementation, which can be found here: + https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py + """ def __init__(self, image_size=224, patch_size=16, num_channels=3, embed_dim=768): super().__init__() @@ -123,11 +126,11 @@ def __init__(self, image_size=224, patch_size=16, num_channels=3, embed_dim=768) self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=patch_size) def forward(self, pixel_values): - B, C, H, W = pixel_values.shape + batch_size, num_channels, height, width = pixel_values.shape # FIXME look at relaxing size constraints assert ( - H == self.image_size[0] and W == self.image_size[1] - ), f"Input image size ({H}*{W}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})." + height == self.image_size[0] and width == self.image_size[1] + ), f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})." x = self.projection(pixel_values).flatten(2).transpose(1, 2) return x @@ -499,9 +502,6 @@ def __init__(self, config): def get_patch_embeddings(self): return self.embeddings.patch_embeddings - def set_patch_embeddings(self, value): - self.embeddings.patch_embeddings = value - def _prune_heads(self, heads_to_prune): """Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} diff --git a/tests/test_feature_extraction_vit.py b/tests/test_feature_extraction_vit.py index 4d69138dfb7ddb..309692d9da6235 100644 --- a/tests/test_feature_extraction_vit.py +++ b/tests/test_feature_extraction_vit.py @@ -128,6 +128,8 @@ def test_call_pil(self): feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) # create random PIL images image_inputs = self.feature_extract_tester.prepare_inputs(equal_resolution=False) + for image in image_inputs: + self.assertIsInstance(image, Image.Image) # Test not batched input encoded_images = feature_extractor(image_inputs[0]).pixel_values @@ -143,6 +145,8 @@ def test_call_numpy(self): feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) # create random numpy tensors image_inputs = self.feature_extract_tester.prepare_inputs(equal_resolution=False, numpify=True) + for image in image_inputs: + self.assertIsInstance(image, np.ndarray) # Test not batched input encoded_images = feature_extractor(image_inputs[0]).pixel_values @@ -159,6 +163,8 @@ def test_call_pytorch(self): feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) # create random PyTorch tensors image_inputs = self.feature_extract_tester.prepare_inputs(equal_resolution=False, torchify=True) + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) # Test not batched input encoded_images = feature_extractor(image_inputs[0]).pixel_values diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 96f5d505ad0aee..c59d64355d57c2 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -41,6 +41,7 @@ MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, + MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, MODEL_MAPPING, AdaptiveEmbedding, BertConfig, @@ -99,6 +100,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): elif model_class in [ *MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.values(), *MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING.values(), + *MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING.values(), ]: inputs_dict["labels"] = torch.zeros( self.model_tester.batch_size, dtype=torch.long, device=torch_device diff --git a/tests/test_modeling_vit.py b/tests/test_modeling_vit.py index 210eea959dde7c..e8c396c8a90f34 100644 --- a/tests/test_modeling_vit.py +++ b/tests/test_modeling_vit.py @@ -85,9 +85,9 @@ def __init__( def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - image_labels = None + labels = None if self.use_labels: - image_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + labels = ids_tensor([self.batch_size], self.type_sequence_label_size) config = ViTConfig( image_size=self.image_size, @@ -104,21 +104,21 @@ def prepare_config_and_inputs(self): initializer_range=self.initializer_range, ) - return config, pixel_values, image_labels + return config, pixel_values, labels - def create_and_check_model(self, config, pixel_values, image_labels): + def create_and_check_model(self, config, pixel_values, labels): model = ViTModel(config=config) model.to(torch_device) model.eval() result = model(pixel_values) self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.patch_size**2 + 1, self.hidden_size)) - def create_and_check_for_image_classification(self, config, pixel_values, image_labels): + 
def create_and_check_for_image_classification(self, config, pixel_values, labels): config.num_labels = self.num_labels model = ViTForImageClassification(config) model.to(torch_device) model.eval() - result = model(pixel_values, labels=image_labels) + result = model(pixel_values, labels=labels) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) def prepare_config_and_inputs_for_common(self): @@ -126,7 +126,7 @@ def prepare_config_and_inputs_for_common(self): ( config, pixel_values, - image_labels, + labels, ) = config_and_inputs inputs_dict = {"pixel_values": pixel_values} return config, inputs_dict @@ -164,12 +164,11 @@ def test_inputs_embeds(self): pass def test_model_common_attributes(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config, _ = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) self.assertIsInstance(model.get_patch_embeddings(), (torch.nn.Module)) - model.set_patch_embeddings(torch.nn.Embedding(10, 10)) x = model.get_output_embeddings() self.assertTrue(x is None or isinstance(x, torch.nn.Linear)) From a7a9e0e1db81b6da2ffa119007e4c60ecb298c88 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 17 Mar 2021 16:24:39 +0100 Subject: [PATCH 28/44] Minor fixes --- src/transformers/models/vit/modeling_vit.py | 4 +- tests/test_modeling_vit.py | 118 ++++++++++++++++++++ 2 files changed, 120 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py index c44e04f7a85c0e..01c683d1a063cd 100644 --- a/src/transformers/models/vit/modeling_vit.py +++ b/src/transformers/models/vit/modeling_vit.py @@ -459,8 +459,8 @@ def _init_weights(self, module): Args: pixel_values (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_channels, height, width)`): Pixel values. Padding will be ignored by default should you provide it. - Pixel values can be obtained using :class:`~transformers.ViTImageProcessor`. See - :meth:`transformers.ViTImageProcessor.__call__` for details. + Pixel values can be obtained using :class:`~transformers.ViTFeatureExtractor`. See + :meth:`transformers.ViTFeatureExtractor.__call__` for details. attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): Mask to avoid performing attention on padding pixel values. 
Mask values selected in ``[0, 1]``: diff --git a/tests/test_modeling_vit.py b/tests/test_modeling_vit.py index e8c396c8a90f34..2339719bb5310d 100644 --- a/tests/test_modeling_vit.py +++ b/tests/test_modeling_vit.py @@ -188,6 +188,124 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + # in ViT, the seq_len equal the square of the patch_size + 1 + patch_size = getattr(self.model_tester, "patch_size", None) + seq_len = patch_size**2 + 1 + decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) + encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) + decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length) + encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) + chunk_length = getattr(self.model_tester, "chunk_length", None) + if chunk_length is not None and hasattr(self.model_tester, "num_hashes"): + encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + if chunk_length is not None: + self.assertListEqual( + list(attentions[0].shape[-4:]), + [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length], + ) + else: + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + out_len = len(outputs) + + if self.is_encoder_decoder: + correct_outlen = 5 + + # loss is at first position + if "labels" in inputs_dict: + correct_outlen += 1 # loss is added to beginning + # Question Answering model returns start_logits and end_logits + if model_class in MODEL_FOR_QUESTION_ANSWERING_MAPPING.values(): + correct_outlen += 1 # start_logits and end_logits instead of only 1 output + if "past_key_values" in outputs: + correct_outlen += 1 # past_key_values have been returned + + self.assertEqual(out_len, correct_outlen) + + # decoder attentions + decoder_attentions = outputs.decoder_attentions + self.assertIsInstance(decoder_attentions, (list, tuple)) + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], + ) + + # cross attentions + cross_attentions = outputs.cross_attentions + 
self.assertIsInstance(cross_attentions, (list, tuple)) + self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(cross_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + decoder_seq_length, + encoder_key_length, + ], + ) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if hasattr(self.model_tester, "num_hidden_states_types"): + added_hidden_states = self.model_tester.num_hidden_states_types + elif self.is_encoder_decoder: + added_hidden_states = 2 + else: + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + if chunk_length is not None: + self.assertListEqual( + list(self_attentions[0].shape[-4:]), + [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length], + ) + else: + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + + def test_hidden_states_output(self): def check_hidden_states_output(inputs_dict, config, model_class): model = model_class(config) From 466cef1a1ef2dd4dc494cd089a1f98318be37f8a Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 19 Mar 2021 10:27:05 +0100 Subject: [PATCH 29/44] Improve tests --- src/transformers/models/vit/modeling_vit.py | 4 ++-- tests/test_modeling_vit.py | 17 ++++++++++------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py index 01c683d1a063cd..fa137dba2a545f 100644 --- a/src/transformers/models/vit/modeling_vit.py +++ b/src/transformers/models/vit/modeling_vit.py @@ -433,14 +433,14 @@ class ViTPreTrainedModel(PreTrainedModel): def _init_weights(self, module): """ Initialize the weights """ - if isinstance(module, (nn.Linear, nn.Embedding)): + if isinstance(module, (nn.Linear, nn.Embedding, nn.Conv2d)): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: + if isinstance(module, (nn.Linear, nn.Conv2d)) and module.bias is not None: module.bias.data.zero_() diff --git a/tests/test_modeling_vit.py b/tests/test_modeling_vit.py index 2339719bb5310d..9f29d8de6d7431 100644 --- a/tests/test_modeling_vit.py +++ b/tests/test_modeling_vit.py @@ -79,7 +79,6 @@ def __init__( self.attention_probs_dropout_prob = attention_probs_dropout_prob self.type_sequence_label_size = type_sequence_label_size self.initializer_range = initializer_range - self.num_labels = num_labels self.scope = scope def prepare_config_and_inputs(self): @@ -111,15 +110,17 @@ def create_and_check_model(self, config, pixel_values, labels): model.to(torch_device) model.eval() result = model(pixel_values) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.patch_size**2 + 1, 
self.hidden_size)) + # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) def create_and_check_for_image_classification(self, config, pixel_values, labels): - config.num_labels = self.num_labels + config.num_labels = self.type_sequence_label_size model = ViTForImageClassification(config) model.to(torch_device) model.eval() result = model(pixel_values, labels=labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() @@ -192,9 +193,11 @@ def test_attention_outputs(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True - # in ViT, the seq_len equal the square of the patch_size + 1 - patch_size = getattr(self.model_tester, "patch_size", None) - seq_len = patch_size**2 + 1 + # in ViT, the seq_len equals the square of number of patches + 1 (we add 1 for the [CLS] token) + image_size = self.model_tester.image_size + patch_size = self.model_tester.patch_size + num_patches = (image_size // patch_size) * (image_size // patch_size) + seq_len = num_patches + 1 decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length) From 647f0e4010d499df0e36c4ed06c9237db7f991df Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 19 Mar 2021 17:48:01 +0100 Subject: [PATCH 30/44] All tests are passing --- src/transformers/models/vit/modeling_vit.py | 2 +- tests/test_modeling_vit.py | 20 ++++++++++++-------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py index fa137dba2a545f..e55871f9d1576a 100644 --- a/src/transformers/models/vit/modeling_vit.py +++ b/src/transformers/models/vit/modeling_vit.py @@ -499,7 +499,7 @@ def __init__(self, config): self.init_weights() - def get_patch_embeddings(self): + def get_input_embeddings(self): return self.embeddings.patch_embeddings def _prune_heads(self, heads_to_prune): diff --git a/tests/test_modeling_vit.py b/tests/test_modeling_vit.py index 9f29d8de6d7431..dd7f7989ffb457 100644 --- a/tests/test_modeling_vit.py +++ b/tests/test_modeling_vit.py @@ -25,14 +25,12 @@ from .test_configuration_common import ConfigTester from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -torch_device = 'cpu' - if is_torch_available(): import torch from transformers import ViTConfig, ViTForImageClassification, ViTModel - from transformers.models.vit.modeling_vit import VIT_PRETRAINED_MODEL_ARCHIVE_LIST + from transformers.models.vit.modeling_vit import VIT_PRETRAINED_MODEL_ARCHIVE_LIST, to_2tuple if is_torchvision_available(): @@ -111,6 +109,8 @@ def create_and_check_model(self, config, pixel_values, labels): model.eval() result = model(pixel_values) # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) + image_size = to_2tuple(self.image_size) + patch_size = to_2tuple(self.patch_size) num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) 
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) @@ -169,7 +169,7 @@ def test_model_common_attributes(self): for model_class in self.all_model_classes: model = model_class(config) - self.assertIsInstance(model.get_patch_embeddings(), (torch.nn.Module)) + self.assertIsInstance(model.get_input_embeddings(), (torch.nn.Module)) x = model.get_output_embeddings() self.assertTrue(x is None or isinstance(x, torch.nn.Linear)) @@ -194,9 +194,9 @@ def test_attention_outputs(self): config.return_dict = True # in ViT, the seq_len equals the square of number of patches + 1 (we add 1 for the [CLS] token) - image_size = self.model_tester.image_size - patch_size = self.model_tester.patch_size - num_patches = (image_size // patch_size) * (image_size // patch_size) + image_size = to_2tuple(self.model_tester.image_size) + patch_size = to_2tuple(self.model_tester.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) seq_len = num_patches + 1 decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) @@ -325,7 +325,11 @@ def check_hidden_states_output(inputs_dict, config, model_class): ) self.assertEqual(len(hidden_states), expected_num_layers) - seq_length = self.model_tester.patch_size**2 + 1 + # ViT has a different seq_length + image_size = to_2tuple(self.model_tester.image_size) + patch_size = to_2tuple(self.model_tester.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + seq_length = num_patches + 1 self.assertListEqual( list(hidden_states[0].shape[-2:]), From e01294c643b6f198dc61b4b3dcfddc3f17461cad Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 19 Mar 2021 20:58:07 +0100 Subject: [PATCH 31/44] Make style & quality, docs improvements --- README.md | 1 + docs/source/index.rst | 7 ++ docs/source/model_doc/vit.rst | 2 +- src/transformers/__init__.py | 7 +- src/transformers/dependency_versions_table.py | 1 + src/transformers/models/auto/modeling_auto.py | 5 +- src/transformers/models/vit/__init__.py | 2 +- .../models/vit/configuration_vit.py | 3 +- .../models/vit/convert_vit_timm_to_pytorch.py | 3 - .../models/vit/feature_extraction_vit.py | 24 +++--- src/transformers/models/vit/modeling_vit.py | 79 +++++++++++------- src/transformers/utils/dummy_pt_objects.py | 34 ++++++++ tests/test_feature_extraction_vit.py | 82 ++++++++++++------- tests/test_modeling_common.py | 2 +- tests/test_modeling_vit.py | 53 ++---------- 15 files changed, 172 insertions(+), 133 deletions(-) diff --git a/README.md b/README.md index de2917c9a23855..18b313e06c38ba 100644 --- a/README.md +++ b/README.md @@ -238,6 +238,7 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 1. **[XLM-RoBERTa](https://huggingface.co/transformers/model_doc/xlmroberta.html)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. 1. 
**[XLNet](https://huggingface.co/transformers/model_doc/xlnet.html)** (from Google/CMU) released with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. 1. **[XLSR-Wav2Vec2](https://huggingface.co/transformers/model_doc/xlsr_wav2vec2.html)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. +1. **[Vision Transformer (ViT)](https://huggingface.co/transformers/model_doc/vit.html)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR. To check if each model has an implementation in PyTorch/TensorFlow/Flax or has an associated tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/transformers/index.html#bigtable) diff --git a/docs/source/index.rst b/docs/source/index.rst index e069b997e8140a..a15cc7885fd99f 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -223,6 +223,10 @@ and conversion utilities for the following models: 47. :doc:`XLSR-Wav2Vec2 ` (from Facebook AI) released with the paper `Unsupervised Cross-Lingual Representation Learning For Speech Recognition `__ by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. +48. :doc:`Vision Transformer (ViT) ` (from Google AI) released with the paper `An Image is Worth 16x16 + Words: Transformers for Image Recognition at Scale `__ by Alexey Dosovitskiy, + Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias + Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. .. _bigtable: @@ -319,6 +323,8 @@ TensorFlow and/or Flax. +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ | Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ | +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| ViT | ❌ | ❌ | ✅ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ | Wav2Vec2 | ✅ | ❌ | ✅ | ❌ | ❌ | +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ | XLM | ✅ | ❌ | ✅ | ✅ | ❌ | @@ -448,6 +454,7 @@ TensorFlow and/or Flax. 
model_doc/t5 model_doc/tapas model_doc/transformerxl + model_doc/vit model_doc/wav2vec2 model_doc/xlm model_doc/xlmprophetnet diff --git a/docs/source/model_doc/vit.rst b/docs/source/model_doc/vit.rst index 533c47c9a61ff2..337fe1dd86a4f8 100644 --- a/docs/source/model_doc/vit.rst +++ b/docs/source/model_doc/vit.rst @@ -59,7 +59,7 @@ ViTFeatureExtractor ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.ViTFeatureExtractor - :members: + :members: __call__ ViTModel diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 531d69051f8a41..eebf70c53a625f 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1430,10 +1430,7 @@ TransfoXLCorpus, TransfoXLTokenizer, ) - from .models.vit import ( - VIT_PRETRAINED_CONFIG_ARCHIVE_MAP, - ViTConfig, - ) + from .models.vit import VIT_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTConfig from .models.wav2vec2 import ( WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP, Wav2Vec2Config, @@ -1556,7 +1553,7 @@ if is_torchvision_available(): from .models.vit import ViTFeatureExtractor - + # Modeling if is_torch_available(): diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 8e0f3773e940f7..25f0adb476cf46 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -49,6 +49,7 @@ "tokenizers": "tokenizers>=0.10.1,<0.11", "torch": "torch>=1.0", "torchaudio": "torchaudio", + "torchvision": "torchvision", "tqdm": "tqdm>=4.27", "unidic": "unidic>=1.0.2", "unidic_lite": "unidic_lite>=1.0.7", diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 692d2c42a11799..3c8f7bdcb07d00 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -226,10 +226,7 @@ TapasModel, ) from ..transfo_xl.modeling_transfo_xl import TransfoXLForSequenceClassification, TransfoXLLMHeadModel, TransfoXLModel -from ..vit.modeling_vit import ( - ViTForImageClassification, - ViTModel, -) +from ..vit.modeling_vit import ViTForImageClassification, ViTModel from ..wav2vec2.modeling_wav2vec2 import Wav2Vec2ForMaskedLM, Wav2Vec2Model from ..xlm.modeling_xlm import ( XLMForMultipleChoice, diff --git a/src/transformers/models/vit/__init__.py b/src/transformers/models/vit/__init__.py index 31333adfd718bc..c155831ef1d1ff 100644 --- a/src/transformers/models/vit/__init__.py +++ b/src/transformers/models/vit/__init__.py @@ -42,7 +42,7 @@ if is_torchvision_available(): from .feature_extraction_vit import ViTFeatureExtractor - + if is_torch_available(): from .modeling_vit import ( VIT_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/vit/configuration_vit.py b/src/transformers/models/vit/configuration_vit.py index 85bc8997f50d98..c967198fb5b9a4 100644 --- a/src/transformers/models/vit/configuration_vit.py +++ b/src/transformers/models/vit/configuration_vit.py @@ -82,7 +82,8 @@ class ViTConfig(PretrainedConfig): num_channels (:obj:`int`, `optional`, defaults to :obj:`3`): The number of input channels. 
- Example:: + + Example:: >>> from transformers import ViTModel, ViTConfig diff --git a/src/transformers/models/vit/convert_vit_timm_to_pytorch.py b/src/transformers/models/vit/convert_vit_timm_to_pytorch.py index 04be8fd53fcd80..aa822ee9a9a4c4 100644 --- a/src/transformers/models/vit/convert_vit_timm_to_pytorch.py +++ b/src/transformers/models/vit/convert_vit_timm_to_pytorch.py @@ -16,14 +16,11 @@ import argparse -import os from pathlib import Path import torch import torchvision.transforms as T -from packaging import version from PIL import Image -from torch import nn import requests import timm diff --git a/src/transformers/models/vit/feature_extraction_vit.py b/src/transformers/models/vit/feature_extraction_vit.py index 7ac79dc6531d30..be9018744145a7 100644 --- a/src/transformers/models/vit/feature_extraction_vit.py +++ b/src/transformers/models/vit/feature_extraction_vit.py @@ -14,14 +14,13 @@ # limitations under the License. """Feature extractor class for ViT.""" -from typing import List, Optional, Union +from typing import List, Union import numpy as np -from PIL import Image import torch +from PIL import Image from torchvision import transforms as T -from ...file_utils import PaddingStrategy, TensorType from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin from ...utils import logging @@ -31,11 +30,11 @@ class ViTFeatureExtractor(FeatureExtractionMixin): r""" - Constructs a ViT feature extractor. - - This feature extractor inherits from :class:`~transformers.FeatureExtractionMixin` which contains most of the main methods. + Constructs a ViT feature extractor. + + This feature extractor inherits from :class:`~transformers.FeatureExtractionMixin` which contains most of the main methods. Users should refer to this superclass for more information regarding those methods. - + Args: image_mean (:obj:`int`, defaults to [0.485, 0.456, 0.406]): The sequence of means for each channel, to be used when normalizing images. @@ -57,7 +56,7 @@ def __init__( image_std=[0.229, 0.224, 0.225], do_normalize=True, do_resize=True, - size=[224,224], + size=[224, 224], **kwargs ): super().__init__(**kwargs) @@ -69,18 +68,17 @@ def __init__( def __call__( self, - images: Union[ - Image.Image, np.ndarray, torch.Tensor, List[Image.Image], List[np.ndarray], List[torch.Tensor] - ], + images: Union[Image.Image, np.ndarray, torch.Tensor, List[Image.Image], List[np.ndarray], List[torch.Tensor]], **kwargs ) -> BatchFeature: """ Main method to prepare for the model one or several image(s). + Args: images (:obj:`PIL.Image.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`): The image or batch of images to be prepared. Each image can be a PIL image, numpy array or a Torch - tensor. In case of a numpy array/Torch tensor, each image should be of shape (C, H, W), where C is a number of channels, - H and W are image height and width. + tensor. In case of a numpy array/Torch tensor, each image should be of shape (C, H, W), where C is a number of channels, + H and W are image height and width. """ # Input type checking for clearer error assert ( diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py index e55871f9d1576a..a3eb5c9e55d0eb 100644 --- a/src/transformers/models/vit/modeling_vit.py +++ b/src/transformers/models/vit/modeling_vit.py @@ -15,8 +15,9 @@ """ PyTorch ViT model. 
""" +import collections.abc import math -import os +from itertools import repeat import torch import torch.utils.checkpoint @@ -33,7 +34,6 @@ from ...modeling_outputs import BaseModelOutput, SequenceClassifierOutput from ...modeling_utils import ( PreTrainedModel, - SequenceSummary, apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer, @@ -53,13 +53,6 @@ ] -""" Layer/Module Helpers -Hacked together by / Copyright 2020 Ross Wightman -""" -import collections.abc -from itertools import repeat - - # Copied from # https://github.com/rwightman/pytorch-image-models/blob/b9bd960a032c75ca6b808ddeed76bee5f3ed4972/timm/models/layers/helpers.py # From PyTorch internals @@ -108,10 +101,10 @@ def forward(self, pixel_values): class PatchEmbeddings(nn.Module): - """ Image to Patch Embedding. - - Based on timm implementation, which can be found here: - https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py + """Image to Patch Embedding. + + Based on timm implementation, which can be found here: + https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py """ def __init__(self, image_size=224, patch_size=16, num_channels=3, embed_dim=768): @@ -379,7 +372,7 @@ def forward( def create_custom_forward(module): def custom_forward(*inputs): - return module(*inputs, past_key_value, output_attentions) + return module(*inputs, output_attentions) return custom_forward @@ -461,7 +454,7 @@ def _init_weights(self, module): Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using :class:`~transformers.ViTFeatureExtractor`. See :meth:`transformers.ViTFeatureExtractor.__call__` for details. - + attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): Mask to avoid performing attention on padding pixel values. Mask values selected in ``[0, 1]``: - 1 for pixels that are real (i.e. **not masked**), @@ -470,10 +463,10 @@ def _init_weights(self, module): head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: - + - 1 indicates the head is **not masked**, - 0 indicates the head is **masked**. - + output_attentions (:obj:`bool`, `optional`): Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned tensors for more detail. 
@@ -511,12 +504,7 @@ def _prune_heads(self, heads_to_prune): self.encoder.layer[layer].attention.prune_heads(heads) @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) - @add_code_sample_docstrings( - tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="vit-base-patch16-224", - output_type=BaseModelOutput, - config_class=_CONFIG_FOR_DOC, - ) + @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, pixel_values=None, @@ -526,6 +514,25 @@ def forward( output_hidden_states=None, return_dict=None, ): + r""" + Returns: + + Examples:: + + >>> from transformers import ViTFeatureExtractor, ViTModel + >>> from PIL import Image + >>> import requests + + >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg' + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224') + >>> model = ViTModel.from_pretrained('google/vit-base-patch16-224') + + >>> inputs = feature_extractor(images=image) + >>> outputs = model(**inputs) + >>> last_hidden_states = outputs.last_hidden_state + """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -604,12 +611,7 @@ def __init__(self, config): self.init_weights() @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) - # @add_code_sample_docstrings( - # tokenizer_class=_TOKENIZER_FOR_DOC, - # checkpoint="vit-base-patch16-224", - # output_type=SequenceClassifierOutput, - # config_class=_CONFIG_FOR_DOC, - # ) + @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) def forward( self, pixel_values=None, @@ -626,6 +628,25 @@ def forward( Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ + Returns: + + Examples:: + + >>> from transformers import ViTFeatureExtractor, ViTForImageClassification + >>> from PIL import Image + >>> import requests + + >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg' + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224') + >>> model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224') + + >>> inputs = feature_extractor(images=image) + >>> outputs = model(**inputs) + >>> logits = outputs.logits + >>> predicted_class = logits.argmax(-1).item() """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index d5ddcd2e3c769c..8e36c9c3191b37 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -287,6 +287,9 @@ def load_tf_weights_in_albert(*args, **kwargs): MODEL_FOR_CAUSAL_LM_MAPPING = None +MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING = None + + MODEL_FOR_MASKED_LM_MAPPING = None @@ -2382,6 +2385,37 @@ def load_tf_weights_in_transfo_xl(*args, **kwargs): requires_pytorch(load_tf_weights_in_transfo_xl) +VIT_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class ViTForImageClassification: + def __init__(self, *args, **kwargs): + requires_pytorch(self) + + +class ViTLayer: + def __init__(self, *args, **kwargs): + requires_pytorch(self) + + +class ViTModel: + def __init__(self, *args, **kwargs): + requires_pytorch(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_pytorch(self) + + +class ViTPreTrainedModel: + def __init__(self, *args, **kwargs): + requires_pytorch(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_pytorch(self) + + WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/tests/test_feature_extraction_vit.py b/tests/test_feature_extraction_vit.py index 309692d9da6235..713b9a7f824eb2 100644 --- a/tests/test_feature_extraction_vit.py +++ b/tests/test_feature_extraction_vit.py @@ -14,19 +14,17 @@ # limitations under the License. -import itertools -import random import unittest -import requests import numpy as np -from transformers import VIT_PRETRAINED_MODEL_ARCHIVE_LIST, ViTConfig, ViTFeatureExtractor +from transformers import ViTFeatureExtractor from transformers.file_utils import is_torch_available, is_torchvision_available from transformers.testing_utils import require_torch, require_torchvision from .test_feature_extraction_common import FeatureExtractionSavingTestMixin + if is_torch_available(): import torch @@ -47,7 +45,7 @@ def __init__( image_std=[0.5, 0.5, 0.5], do_normalize=True, do_resize=True, - size=[18,18], + size=[18, 18], ): self.parent = parent self.batch_size = batch_size @@ -69,18 +67,19 @@ def prepare_feat_extract_dict(self): "do_resize": self.do_resize, "size": self.size, } - + def prepare_inputs(self, equal_resolution=False, numpify=False, torchify=False): - """ This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, - or a list of PyTorch tensors if one specifies torchify=True. + """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, + or a list of PyTorch tensors if one specifies torchify=True. 
""" - + assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time" - + if equal_resolution: image_inputs = [ - np.random.randint(255, size=(self.num_channels, self.max_resolution, self.max_resolution), - dtype=np.uint8) + np.random.randint( + 255, size=(self.num_channels, self.max_resolution, self.max_resolution), dtype=np.uint8 + ) for i in range(self.batch_size) ] else: @@ -104,14 +103,14 @@ def prepare_inputs(self, equal_resolution=False, numpify=False, torchify=False): class ViTFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCase): feature_extraction_class = ViTFeatureExtractor - + def setUp(self): self.feature_extract_tester = ViTFeatureExtractionTester(self) @property def feat_extract_dict(self): return self.feature_extract_tester.prepare_feat_extract_dict() - + def test_feat_extract_properties(self): feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) self.assertTrue(hasattr(feature_extractor, "image_mean")) @@ -122,41 +121,56 @@ def test_feat_extract_properties(self): def test_batch_feature(self): pass - + def test_call_pil(self): # Initialize feature_extractor feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) # create random PIL images image_inputs = self.feature_extract_tester.prepare_inputs(equal_resolution=False) for image in image_inputs: - self.assertIsInstance(image, Image.Image) + self.assertIsInstance(image, Image.Image) # Test not batched input encoded_images = feature_extractor(image_inputs[0]).pixel_values - self.assertEqual(encoded_images.shape, (1, self.feature_extract_tester.num_channels, *self.feature_extract_tester.size)) + self.assertEqual( + encoded_images.shape, (1, self.feature_extract_tester.num_channels, *self.feature_extract_tester.size) + ) # Test batched encoded_images = feature_extractor(image_inputs).pixel_values - self.assertEqual(encoded_images.shape, (self.feature_extract_tester.batch_size, self.feature_extract_tester.num_channels, - *self.feature_extract_tester.size)) - + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + *self.feature_extract_tester.size, + ), + ) + def test_call_numpy(self): # Initialize feature_extractor feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) # create random numpy tensors image_inputs = self.feature_extract_tester.prepare_inputs(equal_resolution=False, numpify=True) for image in image_inputs: - self.assertIsInstance(image, np.ndarray) + self.assertIsInstance(image, np.ndarray) # Test not batched input encoded_images = feature_extractor(image_inputs[0]).pixel_values - self.assertEqual(encoded_images.shape, (1, self.feature_extract_tester.num_channels, - *self.feature_extract_tester.size)) - + self.assertEqual( + encoded_images.shape, (1, self.feature_extract_tester.num_channels, *self.feature_extract_tester.size) + ) + # Test batched encoded_images = feature_extractor(image_inputs).pixel_values - self.assertEqual(encoded_images.shape, (self.feature_extract_tester.batch_size, self.feature_extract_tester.num_channels, - *self.feature_extract_tester.size)) + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + *self.feature_extract_tester.size, + ), + ) def test_call_pytorch(self): # Initialize feature_extractor @@ -164,13 +178,21 @@ def test_call_pytorch(self): # create random PyTorch tensors image_inputs = 
self.feature_extract_tester.prepare_inputs(equal_resolution=False, torchify=True) for image in image_inputs: - self.assertIsInstance(image, torch.Tensor) + self.assertIsInstance(image, torch.Tensor) # Test not batched input encoded_images = feature_extractor(image_inputs[0]).pixel_values - self.assertEqual(encoded_images.shape, (1, self.feature_extract_tester.num_channels, *self.feature_extract_tester.size)) + self.assertEqual( + encoded_images.shape, (1, self.feature_extract_tester.num_channels, *self.feature_extract_tester.size) + ) # Test batched encoded_images = feature_extractor(image_inputs).pixel_values - self.assertEqual(encoded_images.shape, (self.feature_extract_tester.batch_size, self.feature_extract_tester.num_channels, - *self.feature_extract_tester.size)) + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + *self.feature_extract_tester.size, + ), + ) diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index c59d64355d57c2..792b7f89d61393 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -34,6 +34,7 @@ from transformers import ( BERT_PRETRAINED_MODEL_ARCHIVE_LIST, MODEL_FOR_CAUSAL_LM_MAPPING, + MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, MODEL_FOR_MASKED_LM_MAPPING, MODEL_FOR_MULTIPLE_CHOICE_MAPPING, MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING, @@ -41,7 +42,6 @@ MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, - MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, MODEL_MAPPING, AdaptiveEmbedding, BertConfig, diff --git a/tests/test_modeling_vit.py b/tests/test_modeling_vit.py index dd7f7989ffb457..778693ba7d720b 100644 --- a/tests/test_modeling_vit.py +++ b/tests/test_modeling_vit.py @@ -15,8 +15,8 @@ """ Testing suite for the PyTorch ViT model. """ -import unittest import inspect +import unittest import requests from transformers.file_utils import is_torch_available, is_torchvision_available @@ -34,7 +34,6 @@ if is_torchvision_available(): - import torchvision import torchvision.transforms as T from PIL import Image @@ -136,9 +135,10 @@ def prepare_config_and_inputs_for_common(self): @require_torch class ViTModelTest(ModelTesterMixin, unittest.TestCase): """ - Here we also overwrite some of the tests of test_modeling_common.py, as ViT does not use input_ids, inputs_embeds, - attention_mask and seq_length. + Here we also overwrite some of the tests of test_modeling_common.py, as ViT does not use input_ids, inputs_embeds, + attention_mask and seq_length. 
""" + all_model_classes = ( ( ViTModel, @@ -163,7 +163,7 @@ def test_config(self): def test_inputs_embeds(self): # ViT does not use inputs_embeds pass - + def test_model_common_attributes(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() @@ -172,7 +172,7 @@ def test_model_common_attributes(self): self.assertIsInstance(model.get_input_embeddings(), (torch.nn.Module)) x = model.get_output_embeddings() self.assertTrue(x is None or isinstance(x, torch.nn.Linear)) - + def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() @@ -184,7 +184,7 @@ def test_forward_signature(self): expected_arg_names = ["pixel_values"] self.assertListEqual(arg_names[:1], expected_arg_names) - + def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) @@ -241,42 +241,6 @@ def test_attention_outputs(self): ) out_len = len(outputs) - if self.is_encoder_decoder: - correct_outlen = 5 - - # loss is at first position - if "labels" in inputs_dict: - correct_outlen += 1 # loss is added to beginning - # Question Answering model returns start_logits and end_logits - if model_class in MODEL_FOR_QUESTION_ANSWERING_MAPPING.values(): - correct_outlen += 1 # start_logits and end_logits instead of only 1 output - if "past_key_values" in outputs: - correct_outlen += 1 # past_key_values have been returned - - self.assertEqual(out_len, correct_outlen) - - # decoder attentions - decoder_attentions = outputs.decoder_attentions - self.assertIsInstance(decoder_attentions, (list, tuple)) - self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(decoder_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], - ) - - # cross attentions - cross_attentions = outputs.cross_attentions - self.assertIsInstance(cross_attentions, (list, tuple)) - self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(cross_attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - decoder_seq_length, - encoder_key_length, - ], - ) - # Check attention is always last and order is fine inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = True @@ -308,7 +272,6 @@ def test_attention_outputs(self): [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], ) - def test_hidden_states_output(self): def check_hidden_states_output(inputs_dict, config, model_class): model = model_class(config) @@ -347,7 +310,7 @@ def check_hidden_states_output(inputs_dict, config, model_class): config.output_hidden_states = True check_hidden_states_output(inputs_dict, config, model_class) - + def test_for_image_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_image_classification(*config_and_inputs) From 0e02f645be2a7b7c950159b02a471a67daca4481 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sat, 20 Mar 2021 11:00:22 +0100 Subject: [PATCH 32/44] Remove attention mask, add support for head mask --- docs/source/model_doc/vit.rst | 47 ++++--- .../models/vit/configuration_vit.py | 30 ++--- .../models/vit/feature_extraction_vit.py | 10 +- src/transformers/models/vit/modeling_vit.py | 124 ++++++------------ tests/test_modeling_vit.py | 4 +- 5 files changed, 86 insertions(+), 129 deletions(-) diff --git a/docs/source/model_doc/vit.rst 
b/docs/source/model_doc/vit.rst index 337fe1dd86a4f8..8d2eda7050af49 100644 --- a/docs/source/model_doc/vit.rst +++ b/docs/source/model_doc/vit.rst @@ -16,36 +16,41 @@ Vision Transformer (ViT) Overview ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The Vision Transformer (ViT) model was proposed in `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale -`__ by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, -Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. It's the -first paper that successfully trains a Transformer encoder on ImageNet, attaining very good results compared to familiar convolutional -architectures. +The Vision Transformer (ViT) model was proposed in `An Image is Worth 16x16 Words: Transformers for Image Recognition +at Scale `__ by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk +Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob +Uszkoreit, Neil Houlsby. It's the first paper that successfully trains a Transformer encoder on ImageNet, attaining +very good results compared to familiar convolutional architectures. The abstract from the paper is the following: -*While the Transformer architecture has become the de-facto standard for natural language processing tasks, its applications to -computer vision remain limited. In vision, attention is either applied in conjunction with convolutional networks, or used to replace -certain components of convolutional networks while keeping their overall structure in place. We show that this reliance on CNNs is not -necessary and a pure transformer applied directly to sequences of image patches can perform very well on image classification tasks. -When pre-trained on large amounts of data and transferred to multiple mid-sized or small image recognition benchmarks (ImageNet, -CIFAR-100, VTAB, etc.), Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while -requiring substantially fewer computational resources to train.* +*While the Transformer architecture has become the de-facto standard for natural language processing tasks, its +applications to computer vision remain limited. In vision, attention is either applied in conjunction with +convolutional networks, or used to replace certain components of convolutional networks while keeping their overall +structure in place. We show that this reliance on CNNs is not necessary and a pure transformer applied directly to +sequences of image patches can perform very well on image classification tasks. When pre-trained on large amounts of +data and transferred to multiple mid-sized or small image recognition benchmarks (ImageNet, CIFAR-100, VTAB, etc.), +Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while requiring +substantially fewer computational resources to train.* Tips: -- To feed images to the Transformer encoder, each image is split into fixed-size patches, which are then linearly embedded. The authors - also add absolute position embeddings, and feed the resulting sequence of vectors to a standard Transformer encoder. -- The Vision Transformer expects each image to be of the same size (resolution), either 224x224 or 384x384 depending on the checkpoint. 
- One can use :class:`~transformers.ViTFeatureExtractor` to resize (or rescale) and normalize images for the model. -- Both the expected image resolution and patch resolution are reflected in the name of each checkpoint. For example, :obj:`google/vit-base-patch16-224` - refers to a base architecture with image resolution 224x224 and patch resolution of 16x16. All checkpoints can be found on the `hub `__. +- To feed images to the Transformer encoder, each image is split into fixed-size patches, which are then linearly + embedded. The authors also add absolute position embeddings, and feed the resulting sequence of vectors to a standard + Transformer encoder. +- The Vision Transformer expects each image to be of the same size (resolution), either 224x224 or 384x384 depending on + the checkpoint. One can use :class:`~transformers.ViTFeatureExtractor` to resize (or rescale) and normalize images + for the model. +- Both the expected image resolution and patch resolution are reflected in the name of each checkpoint. For example, + :obj:`google/vit-base-patch16-224` refers to a base architecture with image resolution 224x224 and patch resolution + of 16x16. All checkpoints can be found on the `hub `__. The original code (written in JAX) can be found `here `__. -Note that we converted the weights from Ross Wightman's `timm library `__, who already converted -the weights from JAX to PyTorch. Credits go to him! +Note that we converted the weights from Ross Wightman's `timm library +`__, who already converted the weights from JAX to PyTorch. Credits +go to him! ViTConfig @@ -73,4 +78,4 @@ ViTForImageClassification ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.ViTForImageClassification - :members: forward \ No newline at end of file + :members: forward diff --git a/src/transformers/models/vit/configuration_vit.py b/src/transformers/models/vit/configuration_vit.py index c967198fb5b9a4..09e62e2abecb4d 100644 --- a/src/transformers/models/vit/configuration_vit.py +++ b/src/transformers/models/vit/configuration_vit.py @@ -28,23 +28,21 @@ class ViTConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a :class:`~transformers.ViTModel`. - It is used to instantiate an ViT model according to the specified arguments, defining the model - architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of - the ViT `vit-base-patch16-224 `__ architecture. + This is the configuration class to store the configuration of a :class:`~transformers.ViTModel`. It is used to + instantiate an ViT model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the ViT `vit-base-patch16-224 + `__ architecture. - Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used - to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` - for more information. + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model + outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. Args: vocab_size (:obj:`int`, `optional`, defaults to 30522): Vocabulary size of the ViT model. 
Defines the number of different tokens that can be represented by the - :obj:`inputs_ids` passed when calling :class:`~transformers.ViTModel` or - :class:`~transformers.TFViTModel`. - Vocabulary size of the model. Defines the different tokens that - can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.ViTModel`. + :obj:`inputs_ids` passed when calling :class:`~transformers.ViTModel` or :class:`~transformers.TFViTModel`. + Vocabulary size of the model. Defines the different tokens that can be represented by the `inputs_ids` + passed to the forward method of :class:`~transformers.ViTModel`. hidden_size (:obj:`int`, `optional`, defaults to 768): Dimensionality of the encoder layers and the pooler layer. num_hidden_layers (:obj:`int`, `optional`, defaults to 12): @@ -54,15 +52,15 @@ class ViTConfig(PretrainedConfig): intermediate_size (:obj:`int`, `optional`, defaults to 3072): Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): - The non-linear activation function (function or string) in the encoder and pooler. - If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported. + The non-linear activation function (function or string) in the encoder and pooler. If string, + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported. hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): The dropout ratio for the attention probabilities. max_position_embeddings (:obj:`int`, `optional`, defaults to 512): - The maximum sequence length that this model might ever be used with. - Typically set this to something large just in case (e.g., 512 or 1024 or 2048). + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). type_vocab_size (:obj:`int`, `optional`, defaults to 2): The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.ViTModel` or :class:`~transformers.TFViTModel`. @@ -82,7 +80,7 @@ class ViTConfig(PretrainedConfig): num_channels (:obj:`int`, `optional`, defaults to :obj:`3`): The number of input channels. - + Example:: >>> from transformers import ViTModel, ViTConfig diff --git a/src/transformers/models/vit/feature_extraction_vit.py b/src/transformers/models/vit/feature_extraction_vit.py index be9018744145a7..f9ec7007a871d2 100644 --- a/src/transformers/models/vit/feature_extraction_vit.py +++ b/src/transformers/models/vit/feature_extraction_vit.py @@ -32,8 +32,8 @@ class ViTFeatureExtractor(FeatureExtractionMixin): r""" Constructs a ViT feature extractor. - This feature extractor inherits from :class:`~transformers.FeatureExtractionMixin` which contains most of the main methods. - Users should refer to this superclass for more information regarding those methods. + This feature extractor inherits from :class:`~transformers.FeatureExtractionMixin` which contains most of the main + methods. Users should refer to this superclass for more information regarding those methods. Args: image_mean (:obj:`int`, defaults to [0.485, 0.456, 0.406]): @@ -73,12 +73,12 @@ def __call__( ) -> BatchFeature: """ Main method to prepare for the model one or several image(s). 
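For intuition, the preprocessing applied by this method is roughly equivalent to the torchvision pipeline below (a sketch only, assuming the resize and ImageNet mean/std defaults documented above; it is not the exact implementation)::

    >>> from torchvision.transforms import Compose, Normalize, Resize, ToTensor
    >>> preprocess = Compose([
    ...     Resize((224, 224)),
    ...     ToTensor(),
    ...     Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ... ])
    >>> # preprocess(pil_image) yields a float tensor of shape (3, 224, 224), ready to be batched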
- + Args: images (:obj:`PIL.Image.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`): The image or batch of images to be prepared. Each image can be a PIL image, numpy array or a Torch - tensor. In case of a numpy array/Torch tensor, each image should be of shape (C, H, W), where C is a number of channels, - H and W are image height and width. + tensor. In case of a numpy array/Torch tensor, each image should be of shape (C, H, W), where C is a + number of channels, H and W are image height and width. """ # Input type checking for clearer error assert ( diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py index a3eb5c9e55d0eb..387191a664d979 100644 --- a/src/transformers/models/vit/modeling_vit.py +++ b/src/transformers/models/vit/modeling_vit.py @@ -25,19 +25,9 @@ from torch.nn import CrossEntropyLoss, MSELoss from ...activations import ACT2FN -from ...file_utils import ( - add_code_sample_docstrings, - add_start_docstrings, - add_start_docstrings_to_model_forward, - replace_return_docstrings, -) +from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings from ...modeling_outputs import BaseModelOutput, SequenceClassifierOutput -from ...modeling_utils import ( - PreTrainedModel, - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer, -) +from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import logging from .configuration_vit import ViTConfig @@ -45,7 +35,6 @@ logger = logging.get_logger(__name__) _CONFIG_FOR_DOC = "ViTConfig" -_TOKENIZER_FOR_DOC = "ViTTokenizer" VIT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "nielsr/vit-base-patch16-224", @@ -69,7 +58,8 @@ def parse(x): class ViTEmbeddings(nn.Module): - """Construct the cls token, position and patch embeddings. + """ + Construct the cls token, position and patch embeddings. Based on timm implementation, which can be found here: https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py @@ -101,7 +91,8 @@ def forward(self, pixel_values): class PatchEmbeddings(nn.Module): - """Image to Patch Embedding. + """ + Image to Patch Embedding. Based on timm implementation, which can be found here: https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py @@ -155,7 +146,6 @@ def transpose_for_scores(self, x): def forward( self, hidden_states, - attention_mask=None, head_mask=None, output_attentions=False, ): @@ -169,9 +159,6 @@ def forward( attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) attention_scores = attention_scores / math.sqrt(self.attention_head_size) - if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in ViTModel forward() function) - attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. attention_probs = nn.Softmax(dim=-1)(attention_scores) @@ -196,8 +183,10 @@ def forward( class ViTSelfOutput(nn.Module): - """The residual connection is defined in VitLayer instead of here (as is the case with other models), - due to the layernorm applied before each block.""" + """ + The residual connection is defined in VitLayer instead of here (as is the case with other models), due to the + layernorm applied before each block. 
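To make the wiring described above concrete, here is a minimal sketch of the pre-norm residual pattern (illustrative stand-in modules and assumed shapes, not the exact ViT code)::

    >>> import torch
    >>> import torch.nn as nn
    >>> hidden_states = torch.randn(1, 197, 768)        # (batch, 1 + num_patches, hidden_size)
    >>> layernorm_before, layernorm_after = nn.LayerNorm(768), nn.LayerNorm(768)
    >>> attention, mlp = nn.Identity(), nn.Identity()   # stand-ins for the attention and MLP blocks
    >>> hidden_states = attention(layernorm_before(hidden_states)) + hidden_states  # first residual
    >>> hidden_states = mlp(layernorm_after(hidden_states)) + hidden_states         # second residual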
+ """ def __init__(self, config): super().__init__() @@ -240,13 +229,11 @@ def prune_heads(self, heads): def forward( self, hidden_states, - attention_mask=None, head_mask=None, output_attentions=False, ): self_outputs = self.self( hidden_states, - attention_mask, head_mask, output_attentions, ) @@ -305,13 +292,11 @@ def __init__(self, config): def forward( self, hidden_states, - attention_mask=None, head_mask=None, output_attentions=False, ): self_attention_outputs = self.attention( self.layernorm_before(hidden_states), # in ViT, layernorm is applied before self-attention - attention_mask, head_mask, output_attentions=output_attentions, ) @@ -353,7 +338,6 @@ def __init__(self, config): def forward( self, hidden_states, - attention_mask=None, head_mask=None, output_attentions=False, output_hidden_states=False, @@ -379,13 +363,11 @@ def custom_forward(*inputs): layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, - attention_mask, layer_head_mask, ) else: layer_outputs = layer_module( hidden_states, - attention_mask, layer_head_mask, output_attentions, ) @@ -417,8 +399,8 @@ def custom_forward(*inputs): class ViTPreTrainedModel(PreTrainedModel): """ - An abstract class to handle weights initialization and - a simple interface for downloading and loading pretrained models. + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. """ config_class = ViTConfig @@ -438,28 +420,23 @@ def _init_weights(self, module): VIT_START_DOCSTRING = r""" - This model is a PyTorch `torch.nn.Module `_ sub-class. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general - usage and behavior. + This model is a PyTorch `torch.nn.Module `_ sub-class. Use + it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. Parameters: config (:class:`~transformers.ViTConfig`): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the configuration. - Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. """ VIT_INPUTS_DOCSTRING = r""" Args: pixel_values (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_channels, height, width)`): - Pixel values. Padding will be ignored by default should you provide it. - Pixel values can be obtained using :class:`~transformers.ViTFeatureExtractor`. See - :meth:`transformers.ViTFeatureExtractor.__call__` for details. - - attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Mask to avoid performing attention on padding pixel values. Mask values selected in ``[0, 1]``: - - 1 for pixels that are real (i.e. **not masked**), - - 0 for pixels that are padding (i.e. **masked**). - `What are attention masks? <../glossary.html#attention-mask>`__ + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + :class:`~transformers.ViTFeatureExtractor`. See :meth:`transformers.ViTFeatureExtractor.__call__` for + details. 
head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: @@ -496,9 +473,9 @@ def get_input_embeddings(self): return self.embeddings.patch_embeddings def _prune_heads(self, heads_to_prune): - """Prunes heads of the model. - heads_to_prune: dict of {layer_num: list of heads to prune in this layer} - See base class PreTrainedModel + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel """ for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) @@ -508,7 +485,6 @@ def _prune_heads(self, heads_to_prune): def forward( self, pixel_values=None, - attention_mask=None, head_mask=None, output_attentions=None, output_hidden_states=None, @@ -528,7 +504,7 @@ def forward( >>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224') >>> model = ViTModel.from_pretrained('google/vit-base-patch16-224') - + >>> inputs = feature_extractor(images=image) >>> outputs = model(**inputs) >>> last_hidden_states = outputs.last_hidden_state @@ -542,32 +518,12 @@ def forward( if pixel_values is None: raise ValueError("You have to specify pixel_values") - # if input_ids is not None and inputs_embeds is not None: - # raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - # elif input_ids is not None: - # input_shape = input_ids.size() - # batch_size, seq_length = input_shape - # elif inputs_embeds is not None: - # input_shape = inputs_embeds.size()[:-1] - # batch_size, seq_length = input_shape - # else: - # raise ValueError("You have to specify either input_ids or inputs_embeds") - - # device = input_ids.device if input_ids is not None else inputs_embeds.device - - # if attention_mask is None: - # attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) - - # # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # # ourselves in which case we just need to make it broadcastable to all heads. 
- # extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) - - # # Prepare head mask if needed - # # 1.0 in head_mask indicate we keep the head - # # attention_probs has shape bsz x n_heads x N x N - # # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] - # # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - # head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) embedding_output = self.embeddings( pixel_values, @@ -575,7 +531,6 @@ def forward( encoder_outputs = self.encoder( embedding_output, - attention_mask=None, # replaced extended_attention_mask head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, @@ -594,8 +549,10 @@ def forward( @add_start_docstrings( - """ViT Model transformer with an image classification head on top (a linear layer on top of - the pooled output) e.g. for ImageNet. """, + """ + ViT Model transformer with an image classification head on top (a linear layer on top of the pooled output) e.g. + for ImageNet. + """, VIT_START_DOCSTRING, ) class ViTForImageClassification(ViTPreTrainedModel): @@ -615,7 +572,6 @@ def __init__(self, config): def forward( self, pixel_values=None, - attention_mask=None, head_mask=None, labels=None, output_attentions=None, @@ -624,9 +580,8 @@ def forward( ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for computing the image classification/regression loss. - Indices should be in :obj:`[0, ..., config.num_labels - 1]`. - If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + Labels for computing the image classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
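As a sketch of how :obj:`labels` can be used at fine-tuning time (illustrative tensors; the class index is arbitrary)::

    >>> import torch
    >>> from transformers import ViTForImageClassification
    >>> model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224')
    >>> pixel_values = torch.randn(1, 3, 224, 224)   # normally produced by ViTFeatureExtractor
    >>> labels = torch.tensor([281])                 # any index in [0, ..., config.num_labels - 1]
    >>> outputs = model(pixel_values, labels=labels)
    >>> loss, logits = outputs.loss, outputs.logits  # the loss is only returned when labels are given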
Returns: @@ -634,6 +589,7 @@ def forward( Examples:: >>> from transformers import ViTFeatureExtractor, ViTForImageClassification + >>> from datasets import load_dataset >>> from PIL import Image >>> import requests @@ -642,17 +598,17 @@ def forward( >>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224') >>> model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224') - + >>> inputs = feature_extractor(images=image) >>> outputs = model(**inputs) >>> logits = outputs.logits + >>> # model predicts one of the 1000 ImageNet classes >>> predicted_class = logits.argmax(-1).item() """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.vit( pixel_values, - attention_mask=attention_mask, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, diff --git a/tests/test_modeling_vit.py b/tests/test_modeling_vit.py index 778693ba7d720b..e33f350bd4a4c1 100644 --- a/tests/test_modeling_vit.py +++ b/tests/test_modeling_vit.py @@ -193,14 +193,12 @@ def test_attention_outputs(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True - # in ViT, the seq_len equals the square of number of patches + 1 (we add 1 for the [CLS] token) + # in ViT, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token) image_size = to_2tuple(self.model_tester.image_size) patch_size = to_2tuple(self.model_tester.patch_size) num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) seq_len = num_patches + 1 - decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) - decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length) encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) chunk_length = getattr(self.model_tester, "chunk_length", None) if chunk_length is not None and hasattr(self.model_tester, "num_hashes"): From f5ba2f4b75bf3b1f2d4b13066e31811964df633f Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sun, 21 Mar 2021 11:04:55 +0100 Subject: [PATCH 33/44] Some docs improvements + clearer input checking for ViTFeatureExtractor --- docs/source/model_doc/vit.rst | 31 ++++++++++++----- .../models/vit/convert_vit_timm_to_pytorch.py | 2 +- .../models/vit/feature_extraction_vit.py | 33 ++++++++----------- src/transformers/models/vit/modeling_vit.py | 2 +- 4 files changed, 37 insertions(+), 31 deletions(-) diff --git a/docs/source/model_doc/vit.rst b/docs/source/model_doc/vit.rst index 8d2eda7050af49..9e98136f06a817 100644 --- a/docs/source/model_doc/vit.rst +++ b/docs/source/model_doc/vit.rst @@ -36,15 +36,28 @@ substantially fewer computational resources to train.* Tips: -- To feed images to the Transformer encoder, each image is split into fixed-size patches, which are then linearly - embedded. The authors also add absolute position embeddings, and feed the resulting sequence of vectors to a standard - Transformer encoder. -- The Vision Transformer expects each image to be of the same size (resolution), either 224x224 or 384x384 depending on - the checkpoint. One can use :class:`~transformers.ViTFeatureExtractor` to resize (or rescale) and normalize images - for the model. -- Both the expected image resolution and patch resolution are reflected in the name of each checkpoint. 
For example, - :obj:`google/vit-base-patch16-224` refers to a base architecture with image resolution 224x224 and patch resolution - of 16x16. All checkpoints can be found on the `hub `__. +- To feed images to the Transformer encoder, each image is split into a sequence of fixed-size patches, which are then + linearly embedded. A [CLS] token is added to serve as representation of an entire image, which can be used for + classification. The authors also add absolute position embeddings, and feed the resulting sequence of vectors to a + standard Transformer encoder. +- The Vision Transformer was pre-trained using a resolution of 224x224. During fine-tuning, it is often beneficial to + use a higher resolution than pre-training `(Touvron et al., 2019) `__, `(Kolesnikov + et al., 2020) `__. The authors report the best results with a resolution of 384x384 + during fine-tuning. +- As the Vision Transformer expects each image to be of the same size (resolution), one can use + :class:`~transformers.ViTFeatureExtractor` to resize (or rescale) and normalize images for the model. +- Both the patch resolution and image resolution used during fine-tuning are reflected in the name of each checkpoint. + For example, :obj:`google/vit-base-patch16-224` refers to a base architecture with patch resolution of 16x16 and + fine-tuning resolution of 224x224. All checkpoints can be found on the `hub + `__. +- The available checkpoints are pre-trained on `ImageNet-21k `__ (a collection of 14 million + images and 21k classes), and then fine-tuned on `ImageNet `__ (also + referred to as ILSVRC 2012, a collection of 1.3 million images and 1,000 classes). +- The best results are obtained with supervised pre-training, which is not the case in NLP. The authors did also + experiment with a self-supervised pre-training objective, namely masked patched prediction (inspired by masked + language modeling). With this approach, the smaller ViT-B/16 model achieves 79.9% accuracy on ImageNet, a significant + improvement of 2% to training from scratch, but still 4% behind supervised pre-training. + The original code (written in JAX) can be found `here `__. diff --git a/src/transformers/models/vit/convert_vit_timm_to_pytorch.py b/src/transformers/models/vit/convert_vit_timm_to_pytorch.py index aa822ee9a9a4c4..da69156c7dfa81 100644 --- a/src/transformers/models/vit/convert_vit_timm_to_pytorch.py +++ b/src/transformers/models/vit/convert_vit_timm_to_pytorch.py @@ -24,7 +24,7 @@ import requests import timm -from transformers import ViTConfig, ViTForImageClassification, ViTModel +from transformers import ViTConfig, ViTForImageClassification from transformers.utils import logging diff --git a/src/transformers/models/vit/feature_extraction_vit.py b/src/transformers/models/vit/feature_extraction_vit.py index f9ec7007a871d2..b33c6f546adc15 100644 --- a/src/transformers/models/vit/feature_extraction_vit.py +++ b/src/transformers/models/vit/feature_extraction_vit.py @@ -81,27 +81,20 @@ def __call__( number of channels, H and W are image height and width. 
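For example, all of the following are accepted (a sketch with made-up data; shapes follow the (C, H, W) convention described above)::

    >>> import numpy as np
    >>> import torch
    >>> from PIL import Image
    >>> from transformers import ViTFeatureExtractor
    >>> feature_extractor = ViTFeatureExtractor()
    >>> pil_image = Image.new('RGB', (640, 480))
    >>> numpy_image = np.random.randint(255, size=(3, 480, 640), dtype=np.uint8)
    >>> torch_image = torch.randint(255, (3, 480, 640), dtype=torch.uint8)
    >>> out_single = feature_extractor(images=pil_image)                  # a single PIL image
    >>> out_numpy = feature_extractor(images=numpy_image)                 # a single (C, H, W) array
    >>> out_batch = feature_extractor(images=[torch_image, torch_image])  # a batch (list) of tensors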
""" # Input type checking for clearer error - assert ( - isinstance(images, Image.Image) - or isinstance(images, np.ndarray) - or isinstance(images, torch.Tensor) - or ( - ( - isinstance(images, (list, tuple)) - and ( - len(images) == 0 - or ( - isinstance(images[0], Image.Image) - or isinstance(images[0], np.ndarray) - or isinstance(images[0], torch.Tensor) - ) - ) - ) + valid_images = False + + # Check that images has a valid type + if isinstance(images, (Image.Image, np.ndarray, torch.Tensor)): + valid_images = True + elif isinstance(images, (list, tuple)): + if len(images) == 0 or isinstance(images[0], (Image.Image, np.ndarray, torch.Tensor)): + valid_images = True + + if not valid_images: + raise ValueError( + "Images must of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single example)," + "`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of examples)." ) - ), ( - "Images must of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single example)," - "`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of examples)." - ) is_batched = bool( isinstance(images, (list, tuple)) and (isinstance(images[0], (Image.Image, np.ndarray, torch.Tensor))) diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py index 387191a664d979..164e6f258dac95 100644 --- a/src/transformers/models/vit/modeling_vit.py +++ b/src/transformers/models/vit/modeling_vit.py @@ -83,7 +83,7 @@ def forward(self, pixel_values): batch_size = pixel_values.shape[0] embeddings = self.patch_embeddings(pixel_values) - cls_tokens = self.cls_token.expand(batch_size, -1, -1) # stole cls_tokens impl from Phil Wang, thanks + cls_tokens = self.cls_token.expand(batch_size, -1, -1) embeddings = torch.cat((cls_tokens, embeddings), dim=1) embeddings = embeddings + self.position_embeddings embeddings = self.dropout(embeddings) From 852b7771357da33c39f4bd591090f226ff17abf6 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sun, 21 Mar 2021 12:25:35 +0100 Subject: [PATCH 34/44] Change normalization to match original implementation --- .../models/vit/feature_extraction_vit.py | 14 +++---- tests/test_feature_extraction_vit.py | 24 +++++++----- tests/test_modeling_vit.py | 39 ++++++++----------- 3 files changed, 39 insertions(+), 38 deletions(-) diff --git a/src/transformers/models/vit/feature_extraction_vit.py b/src/transformers/models/vit/feature_extraction_vit.py index b33c6f546adc15..6b74814bd05cb3 100644 --- a/src/transformers/models/vit/feature_extraction_vit.py +++ b/src/transformers/models/vit/feature_extraction_vit.py @@ -36,15 +36,15 @@ class ViTFeatureExtractor(FeatureExtractionMixin): methods. Users should refer to this superclass for more information regarding those methods. Args: - image_mean (:obj:`int`, defaults to [0.485, 0.456, 0.406]): + image_mean (:obj:`int`, defaults to [0.5, 0.5, 0.5]): The sequence of means for each channel, to be used when normalizing images. - image_std (:obj:`int`, defaults to [0.229, 0.224, 0.225]): + image_std (:obj:`int`, defaults to [0.5, 0.5, 0.5]): The sequence of standard deviations for each channel, to be used when normalizing images. do_normalize (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether or not to normalize the input with mean and standard deviation. do_resize (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to resize the input to a certain :obj:`size`. 
- size (:obj:`int`, `optional`, defaults to :obj:`List[224, 224]`): + size (:obj:`int`, `optional`, defaults to :obj:`224`): Resize the input to the given size. Only has an effect if :obj:`do_resize` is set to :obj:`True`. """ @@ -52,11 +52,11 @@ class ViTFeatureExtractor(FeatureExtractionMixin): def __init__( self, - image_mean=[0.485, 0.456, 0.406], - image_std=[0.229, 0.224, 0.225], + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], do_normalize=True, do_resize=True, - size=[224, 224], + size=224, **kwargs ): super().__init__(**kwargs) @@ -119,7 +119,7 @@ def __call__( # step 2: define transformations (resizing + normalization) transformations = [] if self.do_resize and self.size is not None: - transformations.append(T.Resize(size=self.size)) + transformations.append(T.Resize(size=(self.size, self.size))) if self.do_normalize: normalization = T.Compose([T.ToTensor(), T.Normalize(self.image_mean, self.image_std)]) transformations.append(normalization) diff --git a/tests/test_feature_extraction_vit.py b/tests/test_feature_extraction_vit.py index 713b9a7f824eb2..4e0e06a08eba2f 100644 --- a/tests/test_feature_extraction_vit.py +++ b/tests/test_feature_extraction_vit.py @@ -18,7 +18,6 @@ import numpy as np -from transformers import ViTFeatureExtractor from transformers.file_utils import is_torch_available, is_torchvision_available from transformers.testing_utils import require_torch, require_torchvision @@ -30,6 +29,7 @@ if is_torchvision_available(): from PIL import Image + from transformers import ViTFeatureExtractor class ViTFeatureExtractionTester(unittest.TestCase): @@ -41,11 +41,11 @@ def __init__( image_size=18, min_resolution=30, max_resolution=400, - image_mean=[0.485, 0.456, 0.406], + image_mean=[0.5, 0.5, 0.5], image_std=[0.5, 0.5, 0.5], do_normalize=True, do_resize=True, - size=[18, 18], + size=18, ): self.parent = parent self.batch_size = batch_size @@ -133,7 +133,8 @@ def test_call_pil(self): # Test not batched input encoded_images = feature_extractor(image_inputs[0]).pixel_values self.assertEqual( - encoded_images.shape, (1, self.feature_extract_tester.num_channels, *self.feature_extract_tester.size) + encoded_images.shape, (1, self.feature_extract_tester.num_channels, self.feature_extract_tester.size, + self.feature_extract_tester.size) ) # Test batched @@ -143,7 +144,8 @@ def test_call_pil(self): ( self.feature_extract_tester.batch_size, self.feature_extract_tester.num_channels, - *self.feature_extract_tester.size, + self.feature_extract_tester.size, + self.feature_extract_tester.size ), ) @@ -158,7 +160,8 @@ def test_call_numpy(self): # Test not batched input encoded_images = feature_extractor(image_inputs[0]).pixel_values self.assertEqual( - encoded_images.shape, (1, self.feature_extract_tester.num_channels, *self.feature_extract_tester.size) + encoded_images.shape, (1, self.feature_extract_tester.num_channels, self.feature_extract_tester.size, + self.feature_extract_tester.size) ) # Test batched @@ -168,7 +171,8 @@ def test_call_numpy(self): ( self.feature_extract_tester.batch_size, self.feature_extract_tester.num_channels, - *self.feature_extract_tester.size, + self.feature_extract_tester.size, + self.feature_extract_tester.size ), ) @@ -183,7 +187,8 @@ def test_call_pytorch(self): # Test not batched input encoded_images = feature_extractor(image_inputs[0]).pixel_values self.assertEqual( - encoded_images.shape, (1, self.feature_extract_tester.num_channels, *self.feature_extract_tester.size) + encoded_images.shape, (1, self.feature_extract_tester.num_channels, 
self.feature_extract_tester.size, + self.feature_extract_tester.size) ) # Test batched @@ -193,6 +198,7 @@ def test_call_pytorch(self): ( self.feature_extract_tester.batch_size, self.feature_extract_tester.num_channels, - *self.feature_extract_tester.size, + self.feature_extract_tester.size, + self.feature_extract_tester.size ), ) diff --git a/tests/test_modeling_vit.py b/tests/test_modeling_vit.py index e33f350bd4a4c1..d6bee36562d33b 100644 --- a/tests/test_modeling_vit.py +++ b/tests/test_modeling_vit.py @@ -19,8 +19,8 @@ import unittest import requests -from transformers.file_utils import is_torch_available, is_torchvision_available -from transformers.testing_utils import require_torch, slow, torch_device +from transformers.file_utils import is_torch_available, is_torchvision_available, cached_property +from transformers.testing_utils import require_torch, require_torchvision, slow, torch_device from .test_configuration_common import ConfigTester from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor @@ -34,7 +34,7 @@ if is_torchvision_available(): - import torchvision.transforms as T + from transformers import ViTFeatureExtractor from PIL import Image @@ -321,41 +321,36 @@ def test_model_from_pretrained(self): # We will verify our results on an image of cute cats -# TODO: use ViTFeatureExtractor in the future -def prepare_img(image_resolution): +def prepare_img(): url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - - # standard PyTorch mean-std input image normalization - transform = T.Compose( - [ - T.Resize((image_resolution, image_resolution)), - T.ToTensor(), - T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), - ] - ) - - # mean-std normalize the input image (batch-size: 1) - img = transform(im).unsqueeze(0) - + img = Image.open(requests.get(url, stream=True).raw) return img @require_torch +@require_torchvision class ViTModelIntegrationTest(unittest.TestCase): + @cached_property + def default_feature_extractor(self): + # TODO: add .from_pretrained() + return ViTFeatureExtractor() + @slow def test_inference_image_classification_head(self): # TODO: replace namespace to google model = ViTForImageClassification.from_pretrained("nielsr/vit-base-patch16-224").to(torch_device) - pixel_values = prepare_img(224).to(torch_device) + feature_extractor = self.default_feature_extractor + image = prepare_img() + inputs = feature_extractor(images=image).to(torch_device) + # forward pass - outputs = model(pixel_values) + outputs = model(**inputs) # verify the logits expected_shape = torch.Size((1, 1000)) self.assertEqual(outputs.logits.shape, expected_shape) - expected_slice = torch.tensor([-0.7332, 0.7286, -0.4020]).to(torch_device) + expected_slice = torch.tensor([-0.2744, 0.8215, -0.0836]).to(torch_device) self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) From 03b763894699f8c00ab5e1f82cede3d9a5389420 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 22 Mar 2021 16:47:10 +0100 Subject: [PATCH 35/44] Fix bugs in tests --- setup.py | 2 +- src/transformers/__init__.py | 2 +- tests/test_feature_extraction_vit.py | 39 +++++++++++++++++++--------- tests/test_modeling_vit.py | 14 +++++----- 4 files changed, 36 insertions(+), 21 deletions(-) diff --git a/setup.py b/setup.py index 16eefa899145b3..8c2c70dcc6b432 100644 --- a/setup.py +++ b/setup.py @@ -237,7 +237,7 @@ def run(self): extras["docs"] = deps_list("recommonmark", "sphinx", "sphinx-markdown-tables", 
"sphinx-rtd-theme", "sphinx-copybutton") extras["quality"] = deps_list("black", "isort", "flake8") -extras["all"] = extras["tf"] + extras["torch"] + extras["flax"] + extras["sentencepiece"] + extras["tokenizers"] +extras["all"] = extras["tf"] + extras["torch"] + extras["flax"] + extras["sentencepiece"] + extras["tokenizers"] + extras["speech"] + extras["vision"] extras["dev"] = ( extras["all"] diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 5900947db9d03b..aa2a2fe41665c6 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -211,7 +211,7 @@ "TransfoXLCorpus", "TransfoXLTokenizer", ], - "models.vit": ["VIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTConfig", "ViTFeatureExtractor"], + "models.vit": ["VIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTConfig"], "models.wav2vec2": [ "WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP", "Wav2Vec2Config", diff --git a/tests/test_feature_extraction_vit.py b/tests/test_feature_extraction_vit.py index 4e0e06a08eba2f..ed4164994eed0f 100644 --- a/tests/test_feature_extraction_vit.py +++ b/tests/test_feature_extraction_vit.py @@ -19,7 +19,7 @@ import numpy as np from transformers.file_utils import is_torch_available, is_torchvision_available -from transformers.testing_utils import require_torch, require_torchvision +from transformers.testing_utils import require_torchvision from .test_feature_extraction_common import FeatureExtractionSavingTestMixin @@ -29,6 +29,7 @@ if is_torchvision_available(): from PIL import Image + from transformers import ViTFeatureExtractor @@ -99,10 +100,9 @@ def prepare_inputs(self, equal_resolution=False, numpify=False, torchify=False): @require_torchvision -@require_torch class ViTFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCase): - feature_extraction_class = ViTFeatureExtractor + feature_extraction_class = ViTFeatureExtractor if is_torchvision_available() else None def setUp(self): self.feature_extract_tester = ViTFeatureExtractionTester(self) @@ -133,8 +133,13 @@ def test_call_pil(self): # Test not batched input encoded_images = feature_extractor(image_inputs[0]).pixel_values self.assertEqual( - encoded_images.shape, (1, self.feature_extract_tester.num_channels, self.feature_extract_tester.size, - self.feature_extract_tester.size) + encoded_images.shape, + ( + 1, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size, + self.feature_extract_tester.size, + ), ) # Test batched @@ -145,7 +150,7 @@ def test_call_pil(self): self.feature_extract_tester.batch_size, self.feature_extract_tester.num_channels, self.feature_extract_tester.size, - self.feature_extract_tester.size + self.feature_extract_tester.size, ), ) @@ -160,8 +165,13 @@ def test_call_numpy(self): # Test not batched input encoded_images = feature_extractor(image_inputs[0]).pixel_values self.assertEqual( - encoded_images.shape, (1, self.feature_extract_tester.num_channels, self.feature_extract_tester.size, - self.feature_extract_tester.size) + encoded_images.shape, + ( + 1, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size, + self.feature_extract_tester.size, + ), ) # Test batched @@ -172,7 +182,7 @@ def test_call_numpy(self): self.feature_extract_tester.batch_size, self.feature_extract_tester.num_channels, self.feature_extract_tester.size, - self.feature_extract_tester.size + self.feature_extract_tester.size, ), ) @@ -187,8 +197,13 @@ def test_call_pytorch(self): # Test not batched input encoded_images = feature_extractor(image_inputs[0]).pixel_values 
self.assertEqual( - encoded_images.shape, (1, self.feature_extract_tester.num_channels, self.feature_extract_tester.size, - self.feature_extract_tester.size) + encoded_images.shape, + ( + 1, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size, + self.feature_extract_tester.size, + ), ) # Test batched @@ -199,6 +214,6 @@ def test_call_pytorch(self): self.feature_extract_tester.batch_size, self.feature_extract_tester.num_channels, self.feature_extract_tester.size, - self.feature_extract_tester.size + self.feature_extract_tester.size, ), ) diff --git a/tests/test_modeling_vit.py b/tests/test_modeling_vit.py index d6bee36562d33b..12e3f07e13ae25 100644 --- a/tests/test_modeling_vit.py +++ b/tests/test_modeling_vit.py @@ -19,7 +19,7 @@ import unittest import requests -from transformers.file_utils import is_torch_available, is_torchvision_available, cached_property +from transformers.file_utils import cached_property, is_torch_available, is_torchvision_available from transformers.testing_utils import require_torch, require_torchvision, slow, torch_device from .test_configuration_common import ConfigTester @@ -34,9 +34,10 @@ if is_torchvision_available(): - from transformers import ViTFeatureExtractor from PIL import Image + from transformers import ViTFeatureExtractor + class ViTModelTester: def __init__( @@ -327,14 +328,13 @@ def prepare_img(): return img -@require_torch @require_torchvision class ViTModelIntegrationTest(unittest.TestCase): @cached_property def default_feature_extractor(self): # TODO: add .from_pretrained() - return ViTFeatureExtractor() - + return ViTFeatureExtractor() if is_torchvision_available() else None + @slow def test_inference_image_classification_head(self): # TODO: replace namespace to google @@ -343,7 +343,7 @@ def test_inference_image_classification_head(self): feature_extractor = self.default_feature_extractor image = prepare_img() inputs = feature_extractor(images=image).to(torch_device) - + # forward pass outputs = model(**inputs) @@ -351,6 +351,6 @@ def test_inference_image_classification_head(self): expected_shape = torch.Size((1, 1000)) self.assertEqual(outputs.logits.shape, expected_shape) - expected_slice = torch.tensor([-0.2744, 0.8215, -0.0836]).to(torch_device) + expected_slice = torch.tensor([-0.2744, 0.8215, -0.0836]).to(torch_device) self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) From f6556b5f8781a09ebdecf624c8b58c95958b00ac Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 22 Mar 2021 16:58:59 +0100 Subject: [PATCH 36/44] One more bug fix --- tests/test_feature_extraction_vit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_feature_extraction_vit.py b/tests/test_feature_extraction_vit.py index ed4164994eed0f..72167cc59f6829 100644 --- a/tests/test_feature_extraction_vit.py +++ b/tests/test_feature_extraction_vit.py @@ -102,7 +102,7 @@ def prepare_inputs(self, equal_resolution=False, numpify=False, torchify=False): @require_torchvision class ViTFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCase): - feature_extraction_class = ViTFeatureExtractor if is_torchvision_available() else None + self.feature_extraction_class = ViTFeatureExtractor if is_torchvision_available() else None def setUp(self): self.feature_extract_tester = ViTFeatureExtractionTester(self) From 884b7a7dba7c0273d82d9466394a3f15d6515f5e Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 22 Mar 2021 17:07:03 +0100 Subject: [PATCH 37/44] Revert previous change --- 
tests/test_feature_extraction_vit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_feature_extraction_vit.py b/tests/test_feature_extraction_vit.py index 72167cc59f6829..ed4164994eed0f 100644 --- a/tests/test_feature_extraction_vit.py +++ b/tests/test_feature_extraction_vit.py @@ -102,7 +102,7 @@ def prepare_inputs(self, equal_resolution=False, numpify=False, torchify=False): @require_torchvision class ViTFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCase): - self.feature_extraction_class = ViTFeatureExtractor if is_torchvision_available() else None + feature_extraction_class = ViTFeatureExtractor if is_torchvision_available() else None def setUp(self): self.feature_extract_tester = ViTFeatureExtractionTester(self) From f35360e97b139af58a91e50b3443937ddb3b668c Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 22 Mar 2021 19:40:49 +0100 Subject: [PATCH 38/44] Address most comments by @sgugger @LysandreJik --- src/transformers/__init__.py | 4 +- src/transformers/models/auto/modeling_auto.py | 97 +++++++++++++++++++ .../models/vit/configuration_vit.py | 35 +------ .../models/vit/feature_extraction_vit.py | 20 ++-- src/transformers/models/vit/modeling_vit.py | 51 ++++------ src/transformers/testing_utils.py | 2 +- tests/test_modeling_vit.py | 12 ++- 7 files changed, 148 insertions(+), 73 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index aa2a2fe41665c6..8b760598953d34 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -832,7 +832,6 @@ [ "VIT_PRETRAINED_MODEL_ARCHIVE_LIST", "ViTForImageClassification", - "ViTLayer", "ViTModel", "ViTPreTrainedModel", ] @@ -1426,7 +1425,7 @@ TransfoXLCorpus, TransfoXLTokenizer, ) - from .models.vit import VIT_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTConfig, ViTFeatureExtractor + from .models.vit import VIT_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTConfig from .models.wav2vec2 import ( WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP, Wav2Vec2Config, @@ -1951,7 +1950,6 @@ from .models.vit import ( VIT_PRETRAINED_MODEL_ARCHIVE_LIST, ViTForImageClassification, - ViTLayer, ViTModel, ViTPreTrainedModel, ) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 3c8f7bdcb07d00..cb4e84dc778eb8 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -1913,3 +1913,100 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): ", ".join(c.__name__ for c in MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING.keys()), ) ) + + +class AutoModelForImageClassification: + r""" + This is a generic model class that will be instantiated as one of the model classes of the library---with an image + classification head---when created with the :meth:`~transformers.AutoModelForImageClassification.from_pretrained` + class method or the :meth:`~transformers.AutoModelForImageClassification.from_config` class method. + + This class cannot be instantiated directly using ``__init__()`` (throws an error). + """ + + def __init__(self): + raise EnvironmentError( + "AutoModelForImageClassification is designed to be instantiated " + "using the `AutoModelForImageClassification.from_pretrained(pretrained_model_name_or_path)` or " + "`AutoModelForImageClassification.from_config(config)` methods." 
+ ) + + @classmethod + @replace_list_option_in_docstrings(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, use_model_types=False) + def from_config(cls, config): + r""" + Instantiates one of the model classes of the library---with an image classification head---from a + configuration. + + Note: + Loading a model from its configuration file does **not** load the model weights. It only affects the + model's configuration. Use :meth:`~transformers.AutoModelForImageClassification.from_pretrained` to load + the model weights. + + Args: + config (:class:`~transformers.PretrainedConfig`): + The model class to instantiate is selected based on the configuration class: + + List options + + Examples:: + + >>> from transformers import AutoConfig, AutoModelForImageClassification + >>> # Download configuration from huggingface.co and cache. + >>> config = AutoConfig.from_pretrained('google/vit_base_patch16_224') + >>> model = AutoModelForImageClassification.from_config(config) + """ + if type(config) in MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING.keys(): + return MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING[type(config)](config) + raise ValueError( + "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" + "Model type should be one of {}.".format( + config.__class__, + cls.__name__, + ", ".join(c.__name__ for c in MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING.keys()), + ) + ) + + @classmethod + @replace_list_option_in_docstrings(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING) + @add_start_docstrings( + "Instantiate one of the model classes of the library---with an image classification head---from a " + "pretrained model.", + AUTO_MODEL_PRETRAINED_DOCSTRING, + ) + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + r""" + Examples:: + + >>> from transformers import AutoConfig, AutoModelForImageClassification + + >>> # Download model and configuration from huggingface.co and cache. + >>> model = AutoModelForImageClassification.from_pretrained('google/vit_base_patch16_224') + + >>> # Update configuration during loading + >>> model = AutoModelForImageClassification.from_pretrained('google/vit_base_patch16_224', output_attentions=True) + >>> model.config.output_attentions + True + + >>> # Loading from a TF checkpoint file instead of a PyTorch model (slower) + >>> config = AutoConfig.from_json_file('./tf_model/vit_tf_model_config.json') + >>> model = AutoModelForImageClassification.from_pretrained('./tf_model/vit_tf_checkpoint.ckpt.index', from_tf=True, config=config) + """ + config = kwargs.pop("config", None) + if not isinstance(config, PretrainedConfig): + config, kwargs = AutoConfig.from_pretrained( + pretrained_model_name_or_path, return_unused_kwargs=True, **kwargs + ) + + if type(config) in MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING.keys(): + return MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING[type(config)].from_pretrained( + pretrained_model_name_or_path, *model_args, config=config, **kwargs + ) + raise ValueError( + "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" + "Model type should be one of {}.".format( + config.__class__, + cls.__name__, + ", ".join(c.__name__ for c in MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING.keys()), + ) + ) diff --git a/src/transformers/models/vit/configuration_vit.py b/src/transformers/models/vit/configuration_vit.py index 09e62e2abecb4d..5e53df4cddfd7d 100644 --- a/src/transformers/models/vit/configuration_vit.py +++ b/src/transformers/models/vit/configuration_vit.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright Google AI and The HuggingFace Inc. team. 
All rights reserved. +# Copyright 2021 Google AI and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ logger = logging.get_logger(__name__) VIT_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "vit-base-patch16-224": "https://huggingface.co/vit-base-patch16-224/resolve/main/config.json", + "nielsr/vit-base-patch16-224": "https://huggingface.co/vit-base-patch16-224/resolve/main/config.json", # See all ViT models at https://huggingface.co/models?filter=vit } @@ -30,19 +30,14 @@ class ViTConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a :class:`~transformers.ViTModel`. It is used to instantiate an ViT model according to the specified arguments, defining the model architecture. Instantiating a - configuration with the defaults will yield a similar configuration to that of the ViT `vit-base-patch16-224 - `__ architecture. + configuration with the defaults will yield a similar configuration to that of the ViT `google/vit-base-patch16-224 + `__ architecture. Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. Args: - vocab_size (:obj:`int`, `optional`, defaults to 30522): - Vocabulary size of the ViT model. Defines the number of different tokens that can be represented by the - :obj:`inputs_ids` passed when calling :class:`~transformers.ViTModel` or :class:`~transformers.TFViTModel`. - Vocabulary size of the model. Defines the different tokens that can be represented by the `inputs_ids` - passed to the forward method of :class:`~transformers.ViTModel`. hidden_size (:obj:`int`, `optional`, defaults to 768): Dimensionality of the encoder layers and the pooler layer. num_hidden_layers (:obj:`int`, `optional`, defaults to 12): @@ -58,19 +53,10 @@ class ViTConfig(PretrainedConfig): The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): The dropout ratio for the attention probabilities. - max_position_embeddings (:obj:`int`, `optional`, defaults to 512): - The maximum sequence length that this model might ever be used with. Typically set this to something large - just in case (e.g., 512 or 1024 or 2048). - type_vocab_size (:obj:`int`, `optional`, defaults to 2): - The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.ViTModel` or - :class:`~transformers.TFViTModel`. initializer_range (:obj:`float`, `optional`, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12): The epsilon used by the layer normalization layers. - use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if ``config.is_decoder=True``. gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`): If True, use gradient checkpointing to save memory at the expense of slower backward pass. 
image_size (:obj:`int`, `optional`, defaults to :obj:`224`): @@ -98,7 +84,6 @@ class ViTConfig(PretrainedConfig): def __init__( self, - vocab_size=30522, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, @@ -106,24 +91,16 @@ def __init__( hidden_act="gelu", hidden_dropout_prob=0.0, attention_probs_dropout_prob=0.0, - max_position_embeddings=512, - type_vocab_size=2, initializer_range=0.02, layer_norm_eps=1e-12, - use_cache=True, is_encoder_decoder=False, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, image_size=224, patch_size=16, num_channels=3, **kwargs ): - super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + super().__init__(**kwargs) - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads @@ -132,9 +109,7 @@ def __init__( self.hidden_dropout_prob = hidden_dropout_prob self.attention_probs_dropout_prob = attention_probs_dropout_prob self.initializer_range = initializer_range - self.type_vocab_size = type_vocab_size self.layer_norm_eps = layer_norm_eps - self.use_cache = use_cache self.image_size = image_size self.patch_size = patch_size diff --git a/src/transformers/models/vit/feature_extraction_vit.py b/src/transformers/models/vit/feature_extraction_vit.py index 6b74814bd05cb3..08aae9b49f1b5a 100644 --- a/src/transformers/models/vit/feature_extraction_vit.py +++ b/src/transformers/models/vit/feature_extraction_vit.py @@ -19,7 +19,7 @@ import numpy as np import torch from PIL import Image -from torchvision import transforms as T +from torchvision.transforms import Compose, Normalize, Resize, ToPILImage, ToTensor from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin from ...utils import logging @@ -76,9 +76,13 @@ def __call__( Args: images (:obj:`PIL.Image.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`): - The image or batch of images to be prepared. Each image can be a PIL image, numpy array or a Torch - tensor. In case of a numpy array/Torch tensor, each image should be of shape (C, H, W), where C is a + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or a PyTorch + tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a number of channels, H and W are image height and width. + + Returns: + :obj:`torch.Tensor` of shape :obj:`(batch_size, num_channels, height, width)`: A PyTorch tensor containing + the preprocessed images. 
""" # Input type checking for clearer error valid_images = False @@ -106,24 +110,24 @@ def __call__( # PIL expects the channel dimension as last dimension images = [Image.fromarray(np.moveaxis(image, 0, -1)) for image in images] elif isinstance(images[0], torch.Tensor): - images = [T.ToPILImage()(image).convert("RGB") for image in images] + images = [ToPILImage()(image).convert("RGB") for image in images] else: if isinstance(images, np.ndarray): # PIL expects the channel dimension as last dimension images = [Image.fromarray(np.moveaxis(images, 0, -1))] elif isinstance(images, torch.Tensor): - images = [T.ToPILImage()(images).convert("RGB")] + images = [ToPILImage()(images).convert("RGB")] else: images = [images] # step 2: define transformations (resizing + normalization) transformations = [] if self.do_resize and self.size is not None: - transformations.append(T.Resize(size=(self.size, self.size))) + transformations.append(Resize(size=(self.size, self.size))) if self.do_normalize: - normalization = T.Compose([T.ToTensor(), T.Normalize(self.image_mean, self.image_std)]) + normalization = Compose([ToTensor(), Normalize(self.image_mean, self.image_std)]) transformations.append(normalization) - transforms = T.Compose(transformations) + transforms = Compose(transformations) # step 3: apply transformations to images and stack pixel_values = [transforms(image) for image in images] diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py index 164e6f258dac95..a2d9e39654096f 100644 --- a/src/transformers/models/vit/modeling_vit.py +++ b/src/transformers/models/vit/modeling_vit.py @@ -42,7 +42,7 @@ ] -# Copied from +# Inspired by # https://github.com/rwightman/pytorch-image-models/blob/b9bd960a032c75ca6b808ddeed76bee5f3ed4972/timm/models/layers/helpers.py # From PyTorch internals def _ntuple(n): @@ -56,13 +56,12 @@ def parse(x): to_2tuple = _ntuple(2) - +# Based on timm implementation, which can be found here: +# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py class ViTEmbeddings(nn.Module): """ - Construct the cls token, position and patch embeddings. + Construct the CLS token, position and patch embeddings. - Based on timm implementation, which can be found here: - https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py """ def __init__(self, config): @@ -90,12 +89,12 @@ def forward(self, pixel_values): return embeddings +# Based on timm implementation, which can be found here: +# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py class PatchEmbeddings(nn.Module): """ Image to Patch Embedding. - Based on timm implementation, which can be found here: - https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py """ def __init__(self, image_size=224, patch_size=16, num_channels=3, embed_dim=768): @@ -112,9 +111,10 @@ def __init__(self, image_size=224, patch_size=16, num_channels=3, embed_dim=768) def forward(self, pixel_values): batch_size, num_channels, height, width = pixel_values.shape # FIXME look at relaxing size constraints - assert ( - height == self.image_size[0] and width == self.image_size[1] - ), f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})." 
+ if height != self.image_size[0] or width != self.image_size[1]: + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})." + ) x = self.projection(pixel_values).flatten(2).transpose(1, 2) return x @@ -143,12 +143,7 @@ def transpose_for_scores(self, x): x = x.view(*new_x_shape) return x.permute(0, 2, 1, 3) - def forward( - self, - hidden_states, - head_mask=None, - output_attentions=False, - ): + def forward(self, hidden_states, head_mask=None, output_attentions=False): mixed_query_layer = self.query(hidden_states) key_layer = self.transpose_for_scores(self.key(hidden_states)) @@ -309,7 +304,7 @@ def forward( # in ViT, layernorm is also applied after self-attention layer_output = self.layernorm_after(hidden_states) - # feedforward chunking not working for now + # TODO feedforward chunking not working for now # layer_output = apply_chunking_to_forward( # self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, layer_output # ) @@ -381,15 +376,7 @@ def custom_forward(*inputs): all_hidden_states = all_hidden_states + (hidden_states,) if not return_dict: - return tuple( - v - for v in [ - hidden_states, - all_hidden_states, - all_self_attentions, - ] - if v is not None - ) + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, hidden_states=all_hidden_states, @@ -408,19 +395,23 @@ class ViTPreTrainedModel(PreTrainedModel): def _init_weights(self, module): """ Initialize the weights """ - if isinstance(module, (nn.Linear, nn.Embedding, nn.Conv2d)): + if isinstance(module, (nn.Linear, nn.Conv2d)): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) - if isinstance(module, (nn.Linear, nn.Conv2d)) and module.bias is not None: - module.bias.data.zero_() VIT_START_DOCSTRING = r""" - This model is a PyTorch `torch.nn.Module `_ sub-class. Use + This model is a PyTorch `torch.nn.Module `_ subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 7f682bee749a99..8230524d138ea6 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -248,7 +248,7 @@ def require_torchvision(test_case): These tests are skipped when torchvision isn't installed. 
""" - if not is_torchvision_available: + if not is_torchvision_available(): return unittest.skip("test requires torchvision")(test_case) else: return test_case diff --git a/tests/test_modeling_vit.py b/tests/test_modeling_vit.py index 12e3f07e13ae25..da3b1e9e6e27f5 100644 --- a/tests/test_modeling_vit.py +++ b/tests/test_modeling_vit.py @@ -159,7 +159,17 @@ def setUp(self): self.config_tester = ConfigTester(self, config_class=ViTConfig, hidden_size=37) def test_config(self): - self.config_tester.run_common_tests() + config = self.config_tester.config_class(**self.config_tester.inputs_dict) + # we omit vocab_size since ViT does not use this + self.config_tester.parent.assertTrue(hasattr(config, "hidden_size")) + self.config_tester.parent.assertTrue(hasattr(config, "num_attention_heads")) + self.config_tester.parent.assertTrue(hasattr(config, "num_hidden_layers")) + + self.config_tester.create_and_test_config_to_json_string() + self.config_tester.create_and_test_config_to_json_file() + self.config_tester.create_and_test_config_from_and_save_pretrained() + self.config_tester.create_and_test_config_with_num_labels() + self.config_tester.check_config_can_be_init_without_params() def test_inputs_embeds(self): # ViT does not use inputs_embeds From f9a1ac6d18e42714407dab3d0f274c6afc2b202a Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 23 Mar 2021 09:46:27 +0100 Subject: [PATCH 39/44] Update conversion script --- .../models/vit/convert_vit_timm_to_pytorch.py | 1054 ++++++++++++++++- src/transformers/models/vit/modeling_vit.py | 2 + src/transformers/utils/dummy_pt_objects.py | 5 - 3 files changed, 1040 insertions(+), 21 deletions(-) diff --git a/src/transformers/models/vit/convert_vit_timm_to_pytorch.py b/src/transformers/models/vit/convert_vit_timm_to_pytorch.py index da69156c7dfa81..5325ccdbaf132a 100644 --- a/src/transformers/models/vit/convert_vit_timm_to_pytorch.py +++ b/src/transformers/models/vit/convert_vit_timm_to_pytorch.py @@ -31,6 +31,1010 @@ logging.set_verbosity_info() logger = logging.get_logger(__name__) +# ImageNet 2012 id's to class names +id2class = { + 0: "tench, Tinca tinca", + 1: "goldfish, Carassius auratus", + 2: "great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias", + 3: "tiger shark, Galeocerdo cuvieri", + 4: "hammerhead, hammerhead shark", + 5: "electric ray, crampfish, numbfish, torpedo", + 6: "stingray", + 7: "cock", + 8: "hen", + 9: "ostrich, Struthio camelus", + 10: "brambling, Fringilla montifringilla", + 11: "goldfinch, Carduelis carduelis", + 12: "house finch, linnet, Carpodacus mexicanus", + 13: "junco, snowbird", + 14: "indigo bunting, indigo finch, indigo bird, Passerina cyanea", + 15: "robin, American robin, Turdus migratorius", + 16: "bulbul", + 17: "jay", + 18: "magpie", + 19: "chickadee", + 20: "water ouzel, dipper", + 21: "kite", + 22: "bald eagle, American eagle, Haliaeetus leucocephalus", + 23: "vulture", + 24: "great grey owl, great gray owl, Strix nebulosa", + 25: "European fire salamander, Salamandra salamandra", + 26: "common newt, Triturus vulgaris", + 27: "eft", + 28: "spotted salamander, Ambystoma maculatum", + 29: "axolotl, mud puppy, Ambystoma mexicanum", + 30: "bullfrog, Rana catesbeiana", + 31: "tree frog, tree-frog", + 32: "tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui", + 33: "loggerhead, loggerhead turtle, Caretta caretta", + 34: "leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea", + 35: "mud turtle", + 36: "terrapin", + 37: "box turtle, box tortoise", + 38: "banded 
gecko", + 39: "common iguana, iguana, Iguana iguana", + 40: "American chameleon, anole, Anolis carolinensis", + 41: "whiptail, whiptail lizard", + 42: "agama", + 43: "frilled lizard, Chlamydosaurus kingi", + 44: "alligator lizard", + 45: "Gila monster, Heloderma suspectum", + 46: "green lizard, Lacerta viridis", + 47: "African chameleon, Chamaeleo chamaeleon", + 48: "Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis", + 49: "African crocodile, Nile crocodile, Crocodylus niloticus", + 50: "American alligator, Alligator mississipiensis", + 51: "triceratops", + 52: "thunder snake, worm snake, Carphophis amoenus", + 53: "ringneck snake, ring-necked snake, ring snake", + 54: "hognose snake, puff adder, sand viper", + 55: "green snake, grass snake", + 56: "king snake, kingsnake", + 57: "garter snake, grass snake", + 58: "water snake", + 59: "vine snake", + 60: "night snake, Hypsiglena torquata", + 61: "boa constrictor, Constrictor constrictor", + 62: "rock python, rock snake, Python sebae", + 63: "Indian cobra, Naja naja", + 64: "green mamba", + 65: "sea snake", + 66: "horned viper, cerastes, sand viper, horned asp, Cerastes cornutus", + 67: "diamondback, diamondback rattlesnake, Crotalus adamanteus", + 68: "sidewinder, horned rattlesnake, Crotalus cerastes", + 69: "trilobite", + 70: "harvestman, daddy longlegs, Phalangium opilio", + 71: "scorpion", + 72: "black and gold garden spider, Argiope aurantia", + 73: "barn spider, Araneus cavaticus", + 74: "garden spider, Aranea diademata", + 75: "black widow, Latrodectus mactans", + 76: "tarantula", + 77: "wolf spider, hunting spider", + 78: "tick", + 79: "centipede", + 80: "black grouse", + 81: "ptarmigan", + 82: "ruffed grouse, partridge, Bonasa umbellus", + 83: "prairie chicken, prairie grouse, prairie fowl", + 84: "peacock", + 85: "quail", + 86: "partridge", + 87: "African grey, African gray, Psittacus erithacus", + 88: "macaw", + 89: "sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita", + 90: "lorikeet", + 91: "coucal", + 92: "bee eater", + 93: "hornbill", + 94: "hummingbird", + 95: "jacamar", + 96: "toucan", + 97: "drake", + 98: "red-breasted merganser, Mergus serrator", + 99: "goose", + 100: "black swan, Cygnus atratus", + 101: "tusker", + 102: "echidna, spiny anteater, anteater", + 103: "platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus", + 104: "wallaby, brush kangaroo", + 105: "koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus", + 106: "wombat", + 107: "jellyfish", + 108: "sea anemone, anemone", + 109: "brain coral", + 110: "flatworm, platyhelminth", + 111: "nematode, nematode worm, roundworm", + 112: "conch", + 113: "snail", + 114: "slug", + 115: "sea slug, nudibranch", + 116: "chiton, coat-of-mail shell, sea cradle, polyplacophore", + 117: "chambered nautilus, pearly nautilus, nautilus", + 118: "Dungeness crab, Cancer magister", + 119: "rock crab, Cancer irroratus", + 120: "fiddler crab", + 121: "king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica", + 122: "American lobster, Northern lobster, Maine lobster, Homarus americanus", + 123: "spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish", + 124: "crayfish, crawfish, crawdad, crawdaddy", + 125: "hermit crab", + 126: "isopod", + 127: "white stork, Ciconia ciconia", + 128: "black stork, Ciconia nigra", + 129: "spoonbill", + 130: "flamingo", + 131: "little blue heron, Egretta caerulea", + 132: "American egret, great white heron, Egretta 
albus", + 133: "bittern", + 134: "crane", + 135: "limpkin, Aramus pictus", + 136: "European gallinule, Porphyrio porphyrio", + 137: "American coot, marsh hen, mud hen, water hen, Fulica americana", + 138: "bustard", + 139: "ruddy turnstone, Arenaria interpres", + 140: "red-backed sandpiper, dunlin, Erolia alpina", + 141: "redshank, Tringa totanus", + 142: "dowitcher", + 143: "oystercatcher, oyster catcher", + 144: "pelican", + 145: "king penguin, Aptenodytes patagonica", + 146: "albatross, mollymawk", + 147: "grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus", + 148: "killer whale, killer, orca, grampus, sea wolf, Orcinus orca", + 149: "dugong, Dugong dugon", + 150: "sea lion", + 151: "Chihuahua", + 152: "Japanese spaniel", + 153: "Maltese dog, Maltese terrier, Maltese", + 154: "Pekinese, Pekingese, Peke", + 155: "Shih-Tzu", + 156: "Blenheim spaniel", + 157: "papillon", + 158: "toy terrier", + 159: "Rhodesian ridgeback", + 160: "Afghan hound, Afghan", + 161: "basset, basset hound", + 162: "beagle", + 163: "bloodhound, sleuthhound", + 164: "bluetick", + 165: "black-and-tan coonhound", + 166: "Walker hound, Walker foxhound", + 167: "English foxhound", + 168: "redbone", + 169: "borzoi, Russian wolfhound", + 170: "Irish wolfhound", + 171: "Italian greyhound", + 172: "whippet", + 173: "Ibizan hound, Ibizan Podenco", + 174: "Norwegian elkhound, elkhound", + 175: "otterhound, otter hound", + 176: "Saluki, gazelle hound", + 177: "Scottish deerhound, deerhound", + 178: "Weimaraner", + 179: "Staffordshire bullterrier, Staffordshire bull terrier", + 180: "American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier", + 181: "Bedlington terrier", + 182: "Border terrier", + 183: "Kerry blue terrier", + 184: "Irish terrier", + 185: "Norfolk terrier", + 186: "Norwich terrier", + 187: "Yorkshire terrier", + 188: "wire-haired fox terrier", + 189: "Lakeland terrier", + 190: "Sealyham terrier, Sealyham", + 191: "Airedale, Airedale terrier", + 192: "cairn, cairn terrier", + 193: "Australian terrier", + 194: "Dandie Dinmont, Dandie Dinmont terrier", + 195: "Boston bull, Boston terrier", + 196: "miniature schnauzer", + 197: "giant schnauzer", + 198: "standard schnauzer", + 199: "Scotch terrier, Scottish terrier, Scottie", + 200: "Tibetan terrier, chrysanthemum dog", + 201: "silky terrier, Sydney silky", + 202: "soft-coated wheaten terrier", + 203: "West Highland white terrier", + 204: "Lhasa, Lhasa apso", + 205: "flat-coated retriever", + 206: "curly-coated retriever", + 207: "golden retriever", + 208: "Labrador retriever", + 209: "Chesapeake Bay retriever", + 210: "German short-haired pointer", + 211: "vizsla, Hungarian pointer", + 212: "English setter", + 213: "Irish setter, red setter", + 214: "Gordon setter", + 215: "Brittany spaniel", + 216: "clumber, clumber spaniel", + 217: "English springer, English springer spaniel", + 218: "Welsh springer spaniel", + 219: "cocker spaniel, English cocker spaniel, cocker", + 220: "Sussex spaniel", + 221: "Irish water spaniel", + 222: "kuvasz", + 223: "schipperke", + 224: "groenendael", + 225: "malinois", + 226: "briard", + 227: "kelpie", + 228: "komondor", + 229: "Old English sheepdog, bobtail", + 230: "Shetland sheepdog, Shetland sheep dog, Shetland", + 231: "collie", + 232: "Border collie", + 233: "Bouvier des Flandres, Bouviers des Flandres", + 234: "Rottweiler", + 235: "German shepherd, German shepherd dog, German police dog, alsatian", + 236: "Doberman, Doberman pinscher", + 237: "miniature 
pinscher", + 238: "Greater Swiss Mountain dog", + 239: "Bernese mountain dog", + 240: "Appenzeller", + 241: "EntleBucher", + 242: "boxer", + 243: "bull mastiff", + 244: "Tibetan mastiff", + 245: "French bulldog", + 246: "Great Dane", + 247: "Saint Bernard, St Bernard", + 248: "Eskimo dog, husky", + 249: "malamute, malemute, Alaskan malamute", + 250: "Siberian husky", + 251: "dalmatian, coach dog, carriage dog", + 252: "affenpinscher, monkey pinscher, monkey dog", + 253: "basenji", + 254: "pug, pug-dog", + 255: "Leonberg", + 256: "Newfoundland, Newfoundland dog", + 257: "Great Pyrenees", + 258: "Samoyed, Samoyede", + 259: "Pomeranian", + 260: "chow, chow chow", + 261: "keeshond", + 262: "Brabancon griffon", + 263: "Pembroke, Pembroke Welsh corgi", + 264: "Cardigan, Cardigan Welsh corgi", + 265: "toy poodle", + 266: "miniature poodle", + 267: "standard poodle", + 268: "Mexican hairless", + 269: "timber wolf, grey wolf, gray wolf, Canis lupus", + 270: "white wolf, Arctic wolf, Canis lupus tundrarum", + 271: "red wolf, maned wolf, Canis rufus, Canis niger", + 272: "coyote, prairie wolf, brush wolf, Canis latrans", + 273: "dingo, warrigal, warragal, Canis dingo", + 274: "dhole, Cuon alpinus", + 275: "African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus", + 276: "hyena, hyaena", + 277: "red fox, Vulpes vulpes", + 278: "kit fox, Vulpes macrotis", + 279: "Arctic fox, white fox, Alopex lagopus", + 280: "grey fox, gray fox, Urocyon cinereoargenteus", + 281: "tabby, tabby cat", + 282: "tiger cat", + 283: "Persian cat", + 284: "Siamese cat, Siamese", + 285: "Egyptian cat", + 286: "cougar, puma, catamount, mountain lion, painter, panther, Felis concolor", + 287: "lynx, catamount", + 288: "leopard, Panthera pardus", + 289: "snow leopard, ounce, Panthera uncia", + 290: "jaguar, panther, Panthera onca, Felis onca", + 291: "lion, king of beasts, Panthera leo", + 292: "tiger, Panthera tigris", + 293: "cheetah, chetah, Acinonyx jubatus", + 294: "brown bear, bruin, Ursus arctos", + 295: "American black bear, black bear, Ursus americanus, Euarctos americanus", + 296: "ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus", + 297: "sloth bear, Melursus ursinus, Ursus ursinus", + 298: "mongoose", + 299: "meerkat, mierkat", + 300: "tiger beetle", + 301: "ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle", + 302: "ground beetle, carabid beetle", + 303: "long-horned beetle, longicorn, longicorn beetle", + 304: "leaf beetle, chrysomelid", + 305: "dung beetle", + 306: "rhinoceros beetle", + 307: "weevil", + 308: "fly", + 309: "bee", + 310: "ant, emmet, pismire", + 311: "grasshopper, hopper", + 312: "cricket", + 313: "walking stick, walkingstick, stick insect", + 314: "cockroach, roach", + 315: "mantis, mantid", + 316: "cicada, cicala", + 317: "leafhopper", + 318: "lacewing, lacewing fly", + 319: "dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk", + 320: "damselfly", + 321: "admiral", + 322: "ringlet, ringlet butterfly", + 323: "monarch, monarch butterfly, milkweed butterfly, Danaus plexippus", + 324: "cabbage butterfly", + 325: "sulphur butterfly, sulfur butterfly", + 326: "lycaenid, lycaenid butterfly", + 327: "starfish, sea star", + 328: "sea urchin", + 329: "sea cucumber, holothurian", + 330: "wood rabbit, cottontail, cottontail rabbit", + 331: "hare", + 332: "Angora, Angora rabbit", + 333: "hamster", + 334: "porcupine, hedgehog", + 335: "fox squirrel, eastern fox squirrel, Sciurus niger", + 336: "marmot", + 337: 
"beaver", + 338: "guinea pig, Cavia cobaya", + 339: "sorrel", + 340: "zebra", + 341: "hog, pig, grunter, squealer, Sus scrofa", + 342: "wild boar, boar, Sus scrofa", + 343: "warthog", + 344: "hippopotamus, hippo, river horse, Hippopotamus amphibius", + 345: "ox", + 346: "water buffalo, water ox, Asiatic buffalo, Bubalus bubalis", + 347: "bison", + 348: "ram, tup", + 349: "bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis", + 350: "ibex, Capra ibex", + 351: "hartebeest", + 352: "impala, Aepyceros melampus", + 353: "gazelle", + 354: "Arabian camel, dromedary, Camelus dromedarius", + 355: "llama", + 356: "weasel", + 357: "mink", + 358: "polecat, fitch, foulmart, foumart, Mustela putorius", + 359: "black-footed ferret, ferret, Mustela nigripes", + 360: "otter", + 361: "skunk, polecat, wood pussy", + 362: "badger", + 363: "armadillo", + 364: "three-toed sloth, ai, Bradypus tridactylus", + 365: "orangutan, orang, orangutang, Pongo pygmaeus", + 366: "gorilla, Gorilla gorilla", + 367: "chimpanzee, chimp, Pan troglodytes", + 368: "gibbon, Hylobates lar", + 369: "siamang, Hylobates syndactylus, Symphalangus syndactylus", + 370: "guenon, guenon monkey", + 371: "patas, hussar monkey, Erythrocebus patas", + 372: "baboon", + 373: "macaque", + 374: "langur", + 375: "colobus, colobus monkey", + 376: "proboscis monkey, Nasalis larvatus", + 377: "marmoset", + 378: "capuchin, ringtail, Cebus capucinus", + 379: "howler monkey, howler", + 380: "titi, titi monkey", + 381: "spider monkey, Ateles geoffroyi", + 382: "squirrel monkey, Saimiri sciureus", + 383: "Madagascar cat, ring-tailed lemur, Lemur catta", + 384: "indri, indris, Indri indri, Indri brevicaudatus", + 385: "Indian elephant, Elephas maximus", + 386: "African elephant, Loxodonta africana", + 387: "lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens", + 388: "giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca", + 389: "barracouta, snoek", + 390: "eel", + 391: "coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch", + 392: "rock beauty, Holocanthus tricolor", + 393: "anemone fish", + 394: "sturgeon", + 395: "gar, garfish, garpike, billfish, Lepisosteus osseus", + 396: "lionfish", + 397: "puffer, pufferfish, blowfish, globefish", + 398: "abacus", + 399: "abaya", + 400: "academic gown, academic robe, judge's robe", + 401: "accordion, piano accordion, squeeze box", + 402: "acoustic guitar", + 403: "aircraft carrier, carrier, flattop, attack aircraft carrier", + 404: "airliner", + 405: "airship, dirigible", + 406: "altar", + 407: "ambulance", + 408: "amphibian, amphibious vehicle", + 409: "analog clock", + 410: "apiary, bee house", + 411: "apron", + 412: "ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin", + 413: "assault rifle, assault gun", + 414: "backpack, back pack, knapsack, packsack, rucksack, haversack", + 415: "bakery, bakeshop, bakehouse", + 416: "balance beam, beam", + 417: "balloon", + 418: "ballpoint, ballpoint pen, ballpen, Biro", + 419: "Band Aid", + 420: "banjo", + 421: "bannister, banister, balustrade, balusters, handrail", + 422: "barbell", + 423: "barber chair", + 424: "barbershop", + 425: "barn", + 426: "barometer", + 427: "barrel, cask", + 428: "barrow, garden cart, lawn cart, wheelbarrow", + 429: "baseball", + 430: "basketball", + 431: "bassinet", + 432: "bassoon", + 433: "bathing cap, swimming cap", + 434: "bath towel", + 435: "bathtub, bathing tub, bath, tub", + 436: "beach wagon, station 
wagon, wagon, estate car, beach waggon, station waggon, waggon", + 437: "beacon, lighthouse, beacon light, pharos", + 438: "beaker", + 439: "bearskin, busby, shako", + 440: "beer bottle", + 441: "beer glass", + 442: "bell cote, bell cot", + 443: "bib", + 444: "bicycle-built-for-two, tandem bicycle, tandem", + 445: "bikini, two-piece", + 446: "binder, ring-binder", + 447: "binoculars, field glasses, opera glasses", + 448: "birdhouse", + 449: "boathouse", + 450: "bobsled, bobsleigh, bob", + 451: "bolo tie, bolo, bola tie, bola", + 452: "bonnet, poke bonnet", + 453: "bookcase", + 454: "bookshop, bookstore, bookstall", + 455: "bottlecap", + 456: "bow", + 457: "bow tie, bow-tie, bowtie", + 458: "brass, memorial tablet, plaque", + 459: "brassiere, bra, bandeau", + 460: "breakwater, groin, groyne, mole, bulwark, seawall, jetty", + 461: "breastplate, aegis, egis", + 462: "broom", + 463: "bucket, pail", + 464: "buckle", + 465: "bulletproof vest", + 466: "bullet train, bullet", + 467: "butcher shop, meat market", + 468: "cab, hack, taxi, taxicab", + 469: "caldron, cauldron", + 470: "candle, taper, wax light", + 471: "cannon", + 472: "canoe", + 473: "can opener, tin opener", + 474: "cardigan", + 475: "car mirror", + 476: "carousel, carrousel, merry-go-round, roundabout, whirligig", + 477: "carpenter's kit, tool kit", + 478: "carton", + 479: "car wheel", + 480: "cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM", + 481: "cassette", + 482: "cassette player", + 483: "castle", + 484: "catamaran", + 485: "CD player", + 486: "cello, violoncello", + 487: "cellular telephone, cellular phone, cellphone, cell, mobile phone", + 488: "chain", + 489: "chainlink fence", + 490: "chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour", + 491: "chain saw, chainsaw", + 492: "chest", + 493: "chiffonier, commode", + 494: "chime, bell, gong", + 495: "china cabinet, china closet", + 496: "Christmas stocking", + 497: "church, church building", + 498: "cinema, movie theater, movie theatre, movie house, picture palace", + 499: "cleaver, meat cleaver, chopper", + 500: "cliff dwelling", + 501: "cloak", + 502: "clog, geta, patten, sabot", + 503: "cocktail shaker", + 504: "coffee mug", + 505: "coffeepot", + 506: "coil, spiral, volute, whorl, helix", + 507: "combination lock", + 508: "computer keyboard, keypad", + 509: "confectionery, confectionary, candy store", + 510: "container ship, containership, container vessel", + 511: "convertible", + 512: "corkscrew, bottle screw", + 513: "cornet, horn, trumpet, trump", + 514: "cowboy boot", + 515: "cowboy hat, ten-gallon hat", + 516: "cradle", + 517: "crane", + 518: "crash helmet", + 519: "crate", + 520: "crib, cot", + 521: "Crock Pot", + 522: "croquet ball", + 523: "crutch", + 524: "cuirass", + 525: "dam, dike, dyke", + 526: "desk", + 527: "desktop computer", + 528: "dial telephone, dial phone", + 529: "diaper, nappy, napkin", + 530: "digital clock", + 531: "digital watch", + 532: "dining table, board", + 533: "dishrag, dishcloth", + 534: "dishwasher, dish washer, dishwashing machine", + 535: "disk brake, disc brake", + 536: "dock, dockage, docking facility", + 537: "dogsled, dog sled, dog sleigh", + 538: "dome", + 539: "doormat, welcome mat", + 540: "drilling platform, offshore rig", + 541: "drum, membranophone, tympan", + 542: "drumstick", + 543: "dumbbell", + 544: "Dutch oven", + 545: "electric fan, blower", + 546: "electric guitar", + 547: "electric locomotive", + 548: "entertainment 
center", + 549: "envelope", + 550: "espresso maker", + 551: "face powder", + 552: "feather boa, boa", + 553: "file, file cabinet, filing cabinet", + 554: "fireboat", + 555: "fire engine, fire truck", + 556: "fire screen, fireguard", + 557: "flagpole, flagstaff", + 558: "flute, transverse flute", + 559: "folding chair", + 560: "football helmet", + 561: "forklift", + 562: "fountain", + 563: "fountain pen", + 564: "four-poster", + 565: "freight car", + 566: "French horn, horn", + 567: "frying pan, frypan, skillet", + 568: "fur coat", + 569: "garbage truck, dustcart", + 570: "gasmask, respirator, gas helmet", + 571: "gas pump, gasoline pump, petrol pump, island dispenser", + 572: "goblet", + 573: "go-kart", + 574: "golf ball", + 575: "golfcart, golf cart", + 576: "gondola", + 577: "gong, tam-tam", + 578: "gown", + 579: "grand piano, grand", + 580: "greenhouse, nursery, glasshouse", + 581: "grille, radiator grille", + 582: "grocery store, grocery, food market, market", + 583: "guillotine", + 584: "hair slide", + 585: "hair spray", + 586: "half track", + 587: "hammer", + 588: "hamper", + 589: "hand blower, blow dryer, blow drier, hair dryer, hair drier", + 590: "hand-held computer, hand-held microcomputer", + 591: "handkerchief, hankie, hanky, hankey", + 592: "hard disc, hard disk, fixed disk", + 593: "harmonica, mouth organ, harp, mouth harp", + 594: "harp", + 595: "harvester, reaper", + 596: "hatchet", + 597: "holster", + 598: "home theater, home theatre", + 599: "honeycomb", + 600: "hook, claw", + 601: "hoopskirt, crinoline", + 602: "horizontal bar, high bar", + 603: "horse cart, horse-cart", + 604: "hourglass", + 605: "iPod", + 606: "iron, smoothing iron", + 607: "jack-o'-lantern", + 608: "jean, blue jean, denim", + 609: "jeep, landrover", + 610: "jersey, T-shirt, tee shirt", + 611: "jigsaw puzzle", + 612: "jinrikisha, ricksha, rickshaw", + 613: "joystick", + 614: "kimono", + 615: "knee pad", + 616: "knot", + 617: "lab coat, laboratory coat", + 618: "ladle", + 619: "lampshade, lamp shade", + 620: "laptop, laptop computer", + 621: "lawn mower, mower", + 622: "lens cap, lens cover", + 623: "letter opener, paper knife, paperknife", + 624: "library", + 625: "lifeboat", + 626: "lighter, light, igniter, ignitor", + 627: "limousine, limo", + 628: "liner, ocean liner", + 629: "lipstick, lip rouge", + 630: "Loafer", + 631: "lotion", + 632: "loudspeaker, speaker, speaker unit, loudspeaker system, speaker system", + 633: "loupe, jeweler's loupe", + 634: "lumbermill, sawmill", + 635: "magnetic compass", + 636: "mailbag, postbag", + 637: "mailbox, letter box", + 638: "maillot", + 639: "maillot, tank suit", + 640: "manhole cover", + 641: "maraca", + 642: "marimba, xylophone", + 643: "mask", + 644: "matchstick", + 645: "maypole", + 646: "maze, labyrinth", + 647: "measuring cup", + 648: "medicine chest, medicine cabinet", + 649: "megalith, megalithic structure", + 650: "microphone, mike", + 651: "microwave, microwave oven", + 652: "military uniform", + 653: "milk can", + 654: "minibus", + 655: "miniskirt, mini", + 656: "minivan", + 657: "missile", + 658: "mitten", + 659: "mixing bowl", + 660: "mobile home, manufactured home", + 661: "Model T", + 662: "modem", + 663: "monastery", + 664: "monitor", + 665: "moped", + 666: "mortar", + 667: "mortarboard", + 668: "mosque", + 669: "mosquito net", + 670: "motor scooter, scooter", + 671: "mountain bike, all-terrain bike, off-roader", + 672: "mountain tent", + 673: "mouse, computer mouse", + 674: "mousetrap", + 675: "moving van", + 676: "muzzle", + 677: "nail", + 
678: "neck brace", + 679: "necklace", + 680: "nipple", + 681: "notebook, notebook computer", + 682: "obelisk", + 683: "oboe, hautboy, hautbois", + 684: "ocarina, sweet potato", + 685: "odometer, hodometer, mileometer, milometer", + 686: "oil filter", + 687: "organ, pipe organ", + 688: "oscilloscope, scope, cathode-ray oscilloscope, CRO", + 689: "overskirt", + 690: "oxcart", + 691: "oxygen mask", + 692: "packet", + 693: "paddle, boat paddle", + 694: "paddlewheel, paddle wheel", + 695: "padlock", + 696: "paintbrush", + 697: "pajama, pyjama, pj's, jammies", + 698: "palace", + 699: "panpipe, pandean pipe, syrinx", + 700: "paper towel", + 701: "parachute, chute", + 702: "parallel bars, bars", + 703: "park bench", + 704: "parking meter", + 705: "passenger car, coach, carriage", + 706: "patio, terrace", + 707: "pay-phone, pay-station", + 708: "pedestal, plinth, footstall", + 709: "pencil box, pencil case", + 710: "pencil sharpener", + 711: "perfume, essence", + 712: "Petri dish", + 713: "photocopier", + 714: "pick, plectrum, plectron", + 715: "pickelhaube", + 716: "picket fence, paling", + 717: "pickup, pickup truck", + 718: "pier", + 719: "piggy bank, penny bank", + 720: "pill bottle", + 721: "pillow", + 722: "ping-pong ball", + 723: "pinwheel", + 724: "pirate, pirate ship", + 725: "pitcher, ewer", + 726: "plane, carpenter's plane, woodworking plane", + 727: "planetarium", + 728: "plastic bag", + 729: "plate rack", + 730: "plow, plough", + 731: "plunger, plumber's helper", + 732: "Polaroid camera, Polaroid Land camera", + 733: "pole", + 734: "police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria", + 735: "poncho", + 736: "pool table, billiard table, snooker table", + 737: "pop bottle, soda bottle", + 738: "pot, flowerpot", + 739: "potter's wheel", + 740: "power drill", + 741: "prayer rug, prayer mat", + 742: "printer", + 743: "prison, prison house", + 744: "projectile, missile", + 745: "projector", + 746: "puck, hockey puck", + 747: "punching bag, punch bag, punching ball, punchball", + 748: "purse", + 749: "quill, quill pen", + 750: "quilt, comforter, comfort, puff", + 751: "racer, race car, racing car", + 752: "racket, racquet", + 753: "radiator", + 754: "radio, wireless", + 755: "radio telescope, radio reflector", + 756: "rain barrel", + 757: "recreational vehicle, RV, R.V.", + 758: "reel", + 759: "reflex camera", + 760: "refrigerator, icebox", + 761: "remote control, remote", + 762: "restaurant, eating house, eating place, eatery", + 763: "revolver, six-gun, six-shooter", + 764: "rifle", + 765: "rocking chair, rocker", + 766: "rotisserie", + 767: "rubber eraser, rubber, pencil eraser", + 768: "rugby ball", + 769: "rule, ruler", + 770: "running shoe", + 771: "safe", + 772: "safety pin", + 773: "saltshaker, salt shaker", + 774: "sandal", + 775: "sarong", + 776: "sax, saxophone", + 777: "scabbard", + 778: "scale, weighing machine", + 779: "school bus", + 780: "schooner", + 781: "scoreboard", + 782: "screen, CRT screen", + 783: "screw", + 784: "screwdriver", + 785: "seat belt, seatbelt", + 786: "sewing machine", + 787: "shield, buckler", + 788: "shoe shop, shoe-shop, shoe store", + 789: "shoji", + 790: "shopping basket", + 791: "shopping cart", + 792: "shovel", + 793: "shower cap", + 794: "shower curtain", + 795: "ski", + 796: "ski mask", + 797: "sleeping bag", + 798: "slide rule, slipstick", + 799: "sliding door", + 800: "slot, one-armed bandit", + 801: "snorkel", + 802: "snowmobile", + 803: "snowplow, snowplough", + 804: "soap dispenser", + 805: "soccer ball", + 806: "sock", 
+ 807: "solar dish, solar collector, solar furnace", + 808: "sombrero", + 809: "soup bowl", + 810: "space bar", + 811: "space heater", + 812: "space shuttle", + 813: "spatula", + 814: "speedboat", + 815: "spider web, spider's web", + 816: "spindle", + 817: "sports car, sport car", + 818: "spotlight, spot", + 819: "stage", + 820: "steam locomotive", + 821: "steel arch bridge", + 822: "steel drum", + 823: "stethoscope", + 824: "stole", + 825: "stone wall", + 826: "stopwatch, stop watch", + 827: "stove", + 828: "strainer", + 829: "streetcar, tram, tramcar, trolley, trolley car", + 830: "stretcher", + 831: "studio couch, day bed", + 832: "stupa, tope", + 833: "submarine, pigboat, sub, U-boat", + 834: "suit, suit of clothes", + 835: "sundial", + 836: "sunglass", + 837: "sunglasses, dark glasses, shades", + 838: "sunscreen, sunblock, sun blocker", + 839: "suspension bridge", + 840: "swab, swob, mop", + 841: "sweatshirt", + 842: "swimming trunks, bathing trunks", + 843: "swing", + 844: "switch, electric switch, electrical switch", + 845: "syringe", + 846: "table lamp", + 847: "tank, army tank, armored combat vehicle, armoured combat vehicle", + 848: "tape player", + 849: "teapot", + 850: "teddy, teddy bear", + 851: "television, television system", + 852: "tennis ball", + 853: "thatch, thatched roof", + 854: "theater curtain, theatre curtain", + 855: "thimble", + 856: "thresher, thrasher, threshing machine", + 857: "throne", + 858: "tile roof", + 859: "toaster", + 860: "tobacco shop, tobacconist shop, tobacconist", + 861: "toilet seat", + 862: "torch", + 863: "totem pole", + 864: "tow truck, tow car, wrecker", + 865: "toyshop", + 866: "tractor", + 867: "trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi", + 868: "tray", + 869: "trench coat", + 870: "tricycle, trike, velocipede", + 871: "trimaran", + 872: "tripod", + 873: "triumphal arch", + 874: "trolleybus, trolley coach, trackless trolley", + 875: "trombone", + 876: "tub, vat", + 877: "turnstile", + 878: "typewriter keyboard", + 879: "umbrella", + 880: "unicycle, monocycle", + 881: "upright, upright piano", + 882: "vacuum, vacuum cleaner", + 883: "vase", + 884: "vault", + 885: "velvet", + 886: "vending machine", + 887: "vestment", + 888: "viaduct", + 889: "violin, fiddle", + 890: "volleyball", + 891: "waffle iron", + 892: "wall clock", + 893: "wallet, billfold, notecase, pocketbook", + 894: "wardrobe, closet, press", + 895: "warplane, military plane", + 896: "washbasin, handbasin, washbowl, lavabo, wash-hand basin", + 897: "washer, automatic washer, washing machine", + 898: "water bottle", + 899: "water jug", + 900: "water tower", + 901: "whiskey jug", + 902: "whistle", + 903: "wig", + 904: "window screen", + 905: "window shade", + 906: "Windsor tie", + 907: "wine bottle", + 908: "wing", + 909: "wok", + 910: "wooden spoon", + 911: "wool, woolen, woollen", + 912: "worm fence, snake fence, snake-rail fence, Virginia fence", + 913: "wreck", + 914: "yawl", + 915: "yurt", + 916: "web site, website, internet site, site", + 917: "comic book", + 918: "crossword puzzle, crossword", + 919: "street sign", + 920: "traffic light, traffic signal, stoplight", + 921: "book jacket, dust cover, dust jacket, dust wrapper", + 922: "menu", + 923: "plate", + 924: "guacamole", + 925: "consomme", + 926: "hot pot, hotpot", + 927: "trifle", + 928: "ice cream, icecream", + 929: "ice lolly, lolly, lollipop, popsicle", + 930: "French loaf", + 931: "bagel, beigel", + 932: "pretzel", + 933: "cheeseburger", + 934: "hotdog, hot dog, red hot", + 935: 
"mashed potato", + 936: "head cabbage", + 937: "broccoli", + 938: "cauliflower", + 939: "zucchini, courgette", + 940: "spaghetti squash", + 941: "acorn squash", + 942: "butternut squash", + 943: "cucumber, cuke", + 944: "artichoke, globe artichoke", + 945: "bell pepper", + 946: "cardoon", + 947: "mushroom", + 948: "Granny Smith", + 949: "strawberry", + 950: "orange", + 951: "lemon", + 952: "fig", + 953: "pineapple, ananas", + 954: "banana", + 955: "jackfruit, jak, jack", + 956: "custard apple", + 957: "pomegranate", + 958: "hay", + 959: "carbonara", + 960: "chocolate sauce, chocolate syrup", + 961: "dough", + 962: "meat loaf, meatloaf", + 963: "pizza, pizza pie", + 964: "potpie", + 965: "burrito", + 966: "red wine", + 967: "espresso", + 968: "cup", + 969: "eggnog", + 970: "alp", + 971: "bubble", + 972: "cliff, drop, drop-off", + 973: "coral reef", + 974: "geyser", + 975: "lakeside, lakeshore", + 976: "promontory, headland, head, foreland", + 977: "sandbar, sand bar", + 978: "seashore, coast, seacoast, sea-coast", + 979: "valley, vale", + 980: "volcano", + 981: "ballplayer, baseball player", + 982: "groom, bridegroom", + 983: "scuba diver", + 984: "rapeseed", + 985: "daisy", + 986: "yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum", + 987: "corn", + 988: "acorn", + 989: "hip, rose hip, rosehip", + 990: "buckeye, horse chestnut, conker", + 991: "coral fungus", + 992: "agaric", + 993: "gyromitra", + 994: "stinkhorn, carrion fungus", + 995: "earthstar", + 996: "hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa", + 997: "bolete", + 998: "ear, spike, capitulum", + 999: "toilet tissue, toilet paper, bathroom tissue", +} + # here we list all keys to be renamed (original name on the left, our name on the right) def create_rename_keys(config, base_model=False): @@ -166,25 +1170,42 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path, base_model=False) # define HuggingFace configuration config = ViTConfig() - if vit_name == "vit_base_patch16_224": - config.num_labels = 1000 - elif vit_name == "vit_base_patch32_224": - config.patch_size = 32 - config.num_labels = 1000 - elif vit_name == "vit_base_patch16_384": - config.image_size = 384 - config.num_labels = 1000 - elif vit_name == "vit_base_patch32_384": - config.image_size = 384 - config.patch_size = 32 + # dataset (ImageNet-21k only or also fine-tuned on ImageNet 2012), patch_size and image_size + if vit_name[-5:] == "in21k": + config.num_labels = 21843 + config.patch_size = int(vit_name[-12:-10]) + config.image_size = int(vit_name[-9:-6]) + else: config.num_labels = 1000 + config.id2label = id2class + config.label2id = {v: k for k, v in id2class.items()} + config.patch_size = int(vit_name[-6:-4]) + config.image_size = int(vit_name[-3:]) + # size of the architecture + if vit_name[4:].startswith("small"): + config.hidden_size = 768 + config.intermediate_size = -1 + config.num_hidden_layers = 8 + config.num_attention_heads = 8 + if vit_name[4:].startswith("base"): + pass + elif vit_name[4:].startswith("large"): + config.hidden_size = 1024 + config.intermediate_size = 4096 + config.num_hidden_layers = 24 + config.num_attention_heads = 16 + elif vit_name[4:].startswith("huge"): + config.hidden_size = 1280 + config.intermediate_size = -1 + config.num_hidden_layers = 32 + config.num_attention_heads = 16 # load original model from timm - vit = timm.create_model(vit_name, pretrained=True) - vit.eval() + timm_model = timm.create_model(vit_name, pretrained=True) + timm_model.eval() 
# load state_dict of original model, remove and rename some keys - state_dict = vit.state_dict() + state_dict = timm_model.state_dict() rename_keys = create_rename_keys(config, base_model) for src, dest in rename_keys: rename_key(state_dict, src, dest) @@ -192,19 +1213,20 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path, base_model=False) if base_model: remove_classification_head_(state_dict) + # load HuggingFace model model = ViTForImageClassification(config).eval() model.load_state_dict(state_dict) # Check logits on an image img = prepare_img(config.image_size) - logits = vit(img) + logits = timm_model(img) outputs = model(img) assert logits.shape == outputs.logits.shape assert torch.allclose(logits, outputs.logits, atol=1e-4) Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model to {pytorch_dump_folder_path}") + print(f"Saving model {vit_name} to {pytorch_dump_folder_path}") model.save_pretrained(pytorch_dump_folder_path) diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py index a2d9e39654096f..5fb1218ad5d297 100644 --- a/src/transformers/models/vit/modeling_vit.py +++ b/src/transformers/models/vit/modeling_vit.py @@ -58,6 +58,8 @@ def parse(x): # Based on timm implementation, which can be found here: # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py + + class ViTEmbeddings(nn.Module): """ Construct the CLS token, position and patch embeddings. diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 8e36c9c3191b37..52477acfaacab4 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -2393,11 +2393,6 @@ def __init__(self, *args, **kwargs): requires_pytorch(self) -class ViTLayer: - def __init__(self, *args, **kwargs): - requires_pytorch(self) - - class ViTModel: def __init__(self, *args, **kwargs): requires_pytorch(self) From 472f96d71464ab9be020c661f0e869391ae49a62 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 23 Mar 2021 16:55:53 +0100 Subject: [PATCH 40/44] Rename self.self to self.attention --- .../models/vit/convert_vit_timm_to_pytorch.py | 16 ++++++++++------ src/transformers/models/vit/modeling_vit.py | 16 ++++++++-------- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/vit/convert_vit_timm_to_pytorch.py b/src/transformers/models/vit/convert_vit_timm_to_pytorch.py index 5325ccdbaf132a..3d49e1540d5422 100644 --- a/src/transformers/models/vit/convert_vit_timm_to_pytorch.py +++ b/src/transformers/models/vit/convert_vit_timm_to_pytorch.py @@ -1110,20 +1110,24 @@ def read_in_q_k_v(state_dict, config, base_model=False): in_proj_weight = state_dict.pop("blocks." + str(i) + ".attn.qkv.weight") in_proj_bias = state_dict.pop("blocks." + str(i) + ".attn.qkv.bias") # next, add query, keys and values (in that order) to the state dict - state_dict["vit.encoder.layer." + str(i) + ".attention.self.query.weight"] = in_proj_weight[ + state_dict["vit.encoder.layer." + str(i) + ".attention.attention.query.weight"] = in_proj_weight[ : config.hidden_size, : ] - state_dict["vit.encoder.layer." + str(i) + ".attention.self.query.bias"] = in_proj_bias[: config.hidden_size] - state_dict["vit.encoder.layer." + str(i) + ".attention.self.key.weight"] = in_proj_weight[ + state_dict["vit.encoder.layer." + str(i) + ".attention.attention.query.bias"] = in_proj_bias[ + : config.hidden_size + ] + state_dict["vit.encoder.layer." 
+ str(i) + ".attention.attention.key.weight"] = in_proj_weight[ config.hidden_size : config.hidden_size * 2, : ] - state_dict["vit.encoder.layer." + str(i) + ".attention.self.key.bias"] = in_proj_bias[ + state_dict["vit.encoder.layer." + str(i) + ".attention.attention.key.bias"] = in_proj_bias[ config.hidden_size : config.hidden_size * 2 ] - state_dict["vit.encoder.layer." + str(i) + ".attention.self.value.weight"] = in_proj_weight[ + state_dict["vit.encoder.layer." + str(i) + ".attention.attention.value.weight"] = in_proj_weight[ -config.hidden_size :, : ] - state_dict["vit.encoder.layer." + str(i) + ".attention.self.value.bias"] = in_proj_bias[-config.hidden_size :] + state_dict["vit.encoder.layer." + str(i) + ".attention.attention.value.bias"] = in_proj_bias[ + -config.hidden_size : + ] # to do: add base model support if base_model: diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py index 5fb1218ad5d297..1b59812d6f5b5c 100644 --- a/src/transformers/models/vit/modeling_vit.py +++ b/src/transformers/models/vit/modeling_vit.py @@ -201,7 +201,7 @@ def forward(self, hidden_states, input_tensor): class ViTAttention(nn.Module): def __init__(self, config): super().__init__() - self.self = ViTSelfAttention(config) + self.attention = ViTSelfAttention(config) self.output = ViTSelfOutput(config) self.pruned_heads = set() @@ -209,18 +209,18 @@ def prune_heads(self, heads): if len(heads) == 0: return heads, index = find_pruneable_heads_and_indices( - heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads ) # Prune linear layers - self.self.query = prune_linear_layer(self.self.query, index) - self.self.key = prune_linear_layer(self.self.key, index) - self.self.value = prune_linear_layer(self.self.value, index) + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) # Update hyper params and store pruned heads - self.self.num_attention_heads = self.self.num_attention_heads - len(heads) - self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) def forward( @@ -229,7 +229,7 @@ def forward( head_mask=None, output_attentions=False, ): - self_outputs = self.self( + self_outputs = self.attention( hidden_states, head_mask, output_attentions, From e790c1d2bb6c93a8fd0e1314ef6a300f6e80a0e1 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 24 Mar 2021 11:10:52 +0100 Subject: [PATCH 41/44] Add pooler option to ViTForImageClassification, improve docs --- docs/source/model_doc/vit.rst | 17 ++++++---------- .../models/vit/configuration_vit.py | 4 ++++ .../models/vit/convert_vit_timm_to_pytorch.py | 12 ++++++++++- src/transformers/models/vit/modeling_vit.py | 20 +++++++++++++++++-- tests/test_modeling_vit.py | 2 +- 5 files changed, 40 insertions(+), 15 deletions(-) diff --git a/docs/source/model_doc/vit.rst b/docs/source/model_doc/vit.rst index 9e98136f06a817..46aef9484427a3 100644 --- a/docs/source/model_doc/vit.rst +++ 
b/docs/source/model_doc/vit.rst @@ -1,17 +1,12 @@ -.. - Copyright 2020 The HuggingFace Team. All rights reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at +Vision Transformer (ViT) +----------------------------------------------------------------------------------------------------------------------- - http://www.apache.org/licenses/LICENSE-2.0 +.. note:: - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on - an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the - specific language governing permissions and limitations under the License. + This is a recently introduced model so the API hasn't been tested extensively. There may be some bugs or slight + breaking changes to fix them in the future. Note that the API of :obj:`ViTFeatureExtractor` will change in the + future, when more frameworks than just PyTorch will be supported. -Vision Transformer (ViT) ------------------------------------------------------------------------------------------------------------------------ Overview ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/src/transformers/models/vit/configuration_vit.py b/src/transformers/models/vit/configuration_vit.py index 5e53df4cddfd7d..2364cf8230f238 100644 --- a/src/transformers/models/vit/configuration_vit.py +++ b/src/transformers/models/vit/configuration_vit.py @@ -65,6 +65,8 @@ class ViTConfig(PretrainedConfig): The size (resolution) of each patch. num_channels (:obj:`int`, `optional`, defaults to :obj:`3`): The number of input channels. + use_pooler (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to use a pooler (linear layer + tanh) before calculating the classification logits. 
Example:: @@ -97,6 +99,7 @@ def __init__( image_size=224, patch_size=16, num_channels=3, + use_pooler=False, **kwargs ): super().__init__(**kwargs) @@ -114,3 +117,4 @@ def __init__( self.image_size = image_size self.patch_size = patch_size self.num_channels = num_channels + self.use_pooler = use_pooler diff --git a/src/transformers/models/vit/convert_vit_timm_to_pytorch.py b/src/transformers/models/vit/convert_vit_timm_to_pytorch.py index 3d49e1540d5422..49f05b1f354b12 100644 --- a/src/transformers/models/vit/convert_vit_timm_to_pytorch.py +++ b/src/transformers/models/vit/convert_vit_timm_to_pytorch.py @@ -1085,6 +1085,15 @@ def create_rename_keys(config, base_model=False): ] ) + # pooler + if config.use_pooler: + rename_keys.extend( + [ + ("pre_logits.fc.weight", "pooler.dense.weight"), + ("pre_logits.fc.bias", "pooler.dense.bias"), + ] + ) + # classification head rename_keys.extend( [ @@ -1179,6 +1188,7 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path, base_model=False) config.num_labels = 21843 config.patch_size = int(vit_name[-12:-10]) config.image_size = int(vit_name[-9:-6]) + config.use_pooler = True else: config.num_labels = 1000 config.id2label = id2class @@ -1227,7 +1237,7 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path, base_model=False) outputs = model(img) assert logits.shape == outputs.logits.shape - assert torch.allclose(logits, outputs.logits, atol=1e-4) + assert torch.allclose(logits, outputs.logits, atol=1e-3) Path(pytorch_dump_folder_path).mkdir(exist_ok=True) print(f"Saving model {vit_name} to {pytorch_dump_folder_path}") diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py index 1b59812d6f5b5c..cca5bfd1fb6ed1 100644 --- a/src/transformers/models/vit/modeling_vit.py +++ b/src/transformers/models/vit/modeling_vit.py @@ -557,6 +557,7 @@ def __init__(self, config): # Classifier head self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity() self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.pooler = ViTPooler(config) if config.use_pooler else None self.init_weights() @@ -582,7 +583,6 @@ def forward( Examples:: >>> from transformers import ViTFeatureExtractor, ViTForImageClassification - >>> from datasets import load_dataset >>> from PIL import Image >>> import requests @@ -610,7 +610,8 @@ def forward( sequence_output = outputs[0] - sequence_output = self.layernorm(sequence_output[:, 0, :]) + sequence_output = self.layernorm(sequence_output) + sequence_output = self.pooler(sequence_output) if self.pooler is not None else sequence_output[:, 0] logits = self.classifier(sequence_output) loss = None @@ -633,3 +634,18 @@ def forward( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + + +class ViTPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output diff --git a/tests/test_modeling_vit.py b/tests/test_modeling_vit.py index da3b1e9e6e27f5..cff9d19d323a0a 100644 --- a/tests/test_modeling_vit.py +++ b/tests/test_modeling_vit.py @@ -348,7 +348,7 @@ def default_feature_extractor(self): @slow def test_inference_image_classification_head(self): # TODO: replace namespace to google - model = ViTForImageClassification.from_pretrained("nielsr/vit-base-patch16-224").to(torch_device) + model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224").to(torch_device) feature_extractor = self.default_feature_extractor image = prepare_img() From 8b95a1ebb43a3b65131843fe2545a02f85190547 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 24 Mar 2021 13:35:33 +0100 Subject: [PATCH 42/44] Add ViTFeatureExtractor to conversion script --- .../models/vit/convert_vit_timm_to_pytorch.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/vit/convert_vit_timm_to_pytorch.py b/src/transformers/models/vit/convert_vit_timm_to_pytorch.py index 49f05b1f354b12..f682e609769e3b 100644 --- a/src/transformers/models/vit/convert_vit_timm_to_pytorch.py +++ b/src/transformers/models/vit/convert_vit_timm_to_pytorch.py @@ -24,7 +24,7 @@ import requests import timm -from transformers import ViTConfig, ViTForImageClassification +from transformers import ViTConfig, ViTFeatureExtractor, ViTForImageClassification from transformers.utils import logging @@ -1210,7 +1210,7 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path, base_model=False) config.num_attention_heads = 16 elif vit_name[4:].startswith("huge"): config.hidden_size = 1280 - config.intermediate_size = -1 + config.intermediate_size = 5120 config.num_hidden_layers = 32 config.num_attention_heads = 16 @@ -1239,9 +1239,14 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path, base_model=False) assert logits.shape == outputs.logits.shape assert torch.allclose(logits, outputs.logits, atol=1e-3) + # load feature extractor and set size + feature_extractor = ViTFeatureExtractor(size=config.image_size) + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) print(f"Saving model {vit_name} to {pytorch_dump_folder_path}") model.save_pretrained(pytorch_dump_folder_path) + print(f"Saving feature extractor to {pytorch_dump_folder_path}") + feature_extractor.save_pretrained(pytorch_dump_folder_path) if __name__ == "__main__": From 37ae119ae0f8c9a91023d76831491a5744588086 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 24 Mar 2021 14:21:14 +0100 Subject: [PATCH 43/44] Add copyright --- docs/source/model_doc/vit.rst | 16 ++++++++++++++-- .../models/vit/convert_vit_timm_to_pytorch.py | 2 +- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/docs/source/model_doc/vit.rst b/docs/source/model_doc/vit.rst index 46aef9484427a3..8d2eae027de8cf 100644 --- a/docs/source/model_doc/vit.rst +++ b/docs/source/model_doc/vit.rst @@ -1,11 +1,23 @@ +.. + Copyright 2020 The HuggingFace Team. All rights reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + specific language governing permissions and limitations under the License. + Vision Transformer (ViT) ----------------------------------------------------------------------------------------------------------------------- .. note:: This is a recently introduced model so the API hasn't been tested extensively. There may be some bugs or slight - breaking changes to fix them in the future. Note that the API of :obj:`ViTFeatureExtractor` will change in the - future, when more frameworks than just PyTorch will be supported. + breaking changes to fix it in the future. Note that the API of :obj:`ViTFeatureExtractor` will change in the + future, when more frameworks than just PyTorch are supported. Overview diff --git a/src/transformers/models/vit/convert_vit_timm_to_pytorch.py b/src/transformers/models/vit/convert_vit_timm_to_pytorch.py index f682e609769e3b..afe9a5cb45f171 100644 --- a/src/transformers/models/vit/convert_vit_timm_to_pytorch.py +++ b/src/transformers/models/vit/convert_vit_timm_to_pytorch.py @@ -1198,7 +1198,7 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path, base_model=False) # size of the architecture if vit_name[4:].startswith("small"): config.hidden_size = 768 - config.intermediate_size = -1 + config.intermediate_size = 2304 config.num_hidden_layers = 8 config.num_attention_heads = 8 if vit_name[4:].startswith("base"): From c6c0f2721b68241efec69ed8e201b3d5f9b2220b Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 26 Mar 2021 11:01:50 +0100 Subject: [PATCH 44/44] Address additional comments --- docs/source/model_doc/vit.rst | 4 +- src/transformers/__init__.py | 7 +- src/transformers/models/auto/__init__.py | 2 + src/transformers/models/vit/__init__.py | 2 - .../models/vit/convert_vit_timm_to_pytorch.py | 1015 +---------------- .../models/vit/feature_extraction_vit.py | 25 +- src/transformers/utils/imagenet_classes.py | 1003 ++++++++++++++++ tests/test_modeling_vit.py | 6 +- 8 files changed, 1031 insertions(+), 1033 deletions(-) create mode 100644 src/transformers/utils/imagenet_classes.py diff --git a/docs/source/model_doc/vit.rst b/docs/source/model_doc/vit.rst index 8d2eae027de8cf..03ce55e2f99599 100644 --- a/docs/source/model_doc/vit.rst +++ b/docs/source/model_doc/vit.rst @@ -16,8 +16,8 @@ Vision Transformer (ViT) .. note:: This is a recently introduced model so the API hasn't been tested extensively. There may be some bugs or slight - breaking changes to fix it in the future. Note that the API of :obj:`ViTFeatureExtractor` will change in the - future, when more frameworks than just PyTorch are supported. + breaking changes to fix it in the future. Note that the API of :class:`~transformers.ViTFeatureExtractor` will + change in the future, when more frameworks than just PyTorch are supported. 
Overview diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 8b760598953d34..1417a0f0e41c08 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1546,9 +1546,6 @@ else: from .utils.dummy_tokenizers_objects import * - if is_torchvision_available(): - from .models.vit import ViTFeatureExtractor - # Modeling if is_torch_available(): @@ -2309,6 +2306,10 @@ # Import the same objects as dummies to get them in the namespace. # They will raise an import error if the user tries to instantiate / use them. from .utils.dummy_flax_objects import * + + if is_torchvision_available(): + from .models.vit import ViTFeatureExtractor + else: import importlib import os diff --git a/src/transformers/models/auto/__init__.py b/src/transformers/models/auto/__init__.py index 1226a0b8370a59..0a47a6cb2b806a 100644 --- a/src/transformers/models/auto/__init__.py +++ b/src/transformers/models/auto/__init__.py @@ -43,6 +43,7 @@ "MODEL_WITH_LM_HEAD_MAPPING", "AutoModel", "AutoModelForCausalLM", + "AutoModelForImageClassification", "AutoModelForMaskedLM", "AutoModelForMultipleChoice", "AutoModelForNextSentencePrediction", @@ -105,6 +106,7 @@ MODEL_WITH_LM_HEAD_MAPPING, AutoModel, AutoModelForCausalLM, + AutoModelForImageClassification, AutoModelForMaskedLM, AutoModelForMultipleChoice, AutoModelForNextSentencePrediction, diff --git a/src/transformers/models/vit/__init__.py b/src/transformers/models/vit/__init__.py index c155831ef1d1ff..3ca8360af2ad1a 100644 --- a/src/transformers/models/vit/__init__.py +++ b/src/transformers/models/vit/__init__.py @@ -31,7 +31,6 @@ _import_structure["modeling_vit"] = [ "VIT_PRETRAINED_MODEL_ARCHIVE_LIST", "ViTForImageClassification", - "ViTLayer", "ViTModel", "ViTPreTrainedModel", ] @@ -47,7 +46,6 @@ from .modeling_vit import ( VIT_PRETRAINED_MODEL_ARCHIVE_LIST, ViTForImageClassification, - ViTLayer, ViTModel, ViTPreTrainedModel, ) diff --git a/src/transformers/models/vit/convert_vit_timm_to_pytorch.py b/src/transformers/models/vit/convert_vit_timm_to_pytorch.py index afe9a5cb45f171..f481e08f04b1b9 100644 --- a/src/transformers/models/vit/convert_vit_timm_to_pytorch.py +++ b/src/transformers/models/vit/convert_vit_timm_to_pytorch.py @@ -19,1022 +19,19 @@ from pathlib import Path import torch -import torchvision.transforms as T from PIL import Image +from torchvision.transforms import Compose, Normalize, Resize, ToTensor import requests import timm from transformers import ViTConfig, ViTFeatureExtractor, ViTForImageClassification from transformers.utils import logging +from transformers.utils.imagenet_classes import id2label logging.set_verbosity_info() logger = logging.get_logger(__name__) -# ImageNet 2012 id's to class names -id2class = { - 0: "tench, Tinca tinca", - 1: "goldfish, Carassius auratus", - 2: "great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias", - 3: "tiger shark, Galeocerdo cuvieri", - 4: "hammerhead, hammerhead shark", - 5: "electric ray, crampfish, numbfish, torpedo", - 6: "stingray", - 7: "cock", - 8: "hen", - 9: "ostrich, Struthio camelus", - 10: "brambling, Fringilla montifringilla", - 11: "goldfinch, Carduelis carduelis", - 12: "house finch, linnet, Carpodacus mexicanus", - 13: "junco, snowbird", - 14: "indigo bunting, indigo finch, indigo bird, Passerina cyanea", - 15: "robin, American robin, Turdus migratorius", - 16: "bulbul", - 17: "jay", - 18: "magpie", - 19: "chickadee", - 20: "water ouzel, dipper", - 21: "kite", - 22: "bald eagle, American eagle, Haliaeetus leucocephalus", - 
23: "vulture", - 24: "great grey owl, great gray owl, Strix nebulosa", - 25: "European fire salamander, Salamandra salamandra", - 26: "common newt, Triturus vulgaris", - 27: "eft", - 28: "spotted salamander, Ambystoma maculatum", - 29: "axolotl, mud puppy, Ambystoma mexicanum", - 30: "bullfrog, Rana catesbeiana", - 31: "tree frog, tree-frog", - 32: "tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui", - 33: "loggerhead, loggerhead turtle, Caretta caretta", - 34: "leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea", - 35: "mud turtle", - 36: "terrapin", - 37: "box turtle, box tortoise", - 38: "banded gecko", - 39: "common iguana, iguana, Iguana iguana", - 40: "American chameleon, anole, Anolis carolinensis", - 41: "whiptail, whiptail lizard", - 42: "agama", - 43: "frilled lizard, Chlamydosaurus kingi", - 44: "alligator lizard", - 45: "Gila monster, Heloderma suspectum", - 46: "green lizard, Lacerta viridis", - 47: "African chameleon, Chamaeleo chamaeleon", - 48: "Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis", - 49: "African crocodile, Nile crocodile, Crocodylus niloticus", - 50: "American alligator, Alligator mississipiensis", - 51: "triceratops", - 52: "thunder snake, worm snake, Carphophis amoenus", - 53: "ringneck snake, ring-necked snake, ring snake", - 54: "hognose snake, puff adder, sand viper", - 55: "green snake, grass snake", - 56: "king snake, kingsnake", - 57: "garter snake, grass snake", - 58: "water snake", - 59: "vine snake", - 60: "night snake, Hypsiglena torquata", - 61: "boa constrictor, Constrictor constrictor", - 62: "rock python, rock snake, Python sebae", - 63: "Indian cobra, Naja naja", - 64: "green mamba", - 65: "sea snake", - 66: "horned viper, cerastes, sand viper, horned asp, Cerastes cornutus", - 67: "diamondback, diamondback rattlesnake, Crotalus adamanteus", - 68: "sidewinder, horned rattlesnake, Crotalus cerastes", - 69: "trilobite", - 70: "harvestman, daddy longlegs, Phalangium opilio", - 71: "scorpion", - 72: "black and gold garden spider, Argiope aurantia", - 73: "barn spider, Araneus cavaticus", - 74: "garden spider, Aranea diademata", - 75: "black widow, Latrodectus mactans", - 76: "tarantula", - 77: "wolf spider, hunting spider", - 78: "tick", - 79: "centipede", - 80: "black grouse", - 81: "ptarmigan", - 82: "ruffed grouse, partridge, Bonasa umbellus", - 83: "prairie chicken, prairie grouse, prairie fowl", - 84: "peacock", - 85: "quail", - 86: "partridge", - 87: "African grey, African gray, Psittacus erithacus", - 88: "macaw", - 89: "sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita", - 90: "lorikeet", - 91: "coucal", - 92: "bee eater", - 93: "hornbill", - 94: "hummingbird", - 95: "jacamar", - 96: "toucan", - 97: "drake", - 98: "red-breasted merganser, Mergus serrator", - 99: "goose", - 100: "black swan, Cygnus atratus", - 101: "tusker", - 102: "echidna, spiny anteater, anteater", - 103: "platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus", - 104: "wallaby, brush kangaroo", - 105: "koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus", - 106: "wombat", - 107: "jellyfish", - 108: "sea anemone, anemone", - 109: "brain coral", - 110: "flatworm, platyhelminth", - 111: "nematode, nematode worm, roundworm", - 112: "conch", - 113: "snail", - 114: "slug", - 115: "sea slug, nudibranch", - 116: "chiton, coat-of-mail shell, sea cradle, polyplacophore", - 117: "chambered nautilus, pearly nautilus, nautilus", - 118: "Dungeness crab, 
Cancer magister", - 119: "rock crab, Cancer irroratus", - 120: "fiddler crab", - 121: "king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica", - 122: "American lobster, Northern lobster, Maine lobster, Homarus americanus", - 123: "spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish", - 124: "crayfish, crawfish, crawdad, crawdaddy", - 125: "hermit crab", - 126: "isopod", - 127: "white stork, Ciconia ciconia", - 128: "black stork, Ciconia nigra", - 129: "spoonbill", - 130: "flamingo", - 131: "little blue heron, Egretta caerulea", - 132: "American egret, great white heron, Egretta albus", - 133: "bittern", - 134: "crane", - 135: "limpkin, Aramus pictus", - 136: "European gallinule, Porphyrio porphyrio", - 137: "American coot, marsh hen, mud hen, water hen, Fulica americana", - 138: "bustard", - 139: "ruddy turnstone, Arenaria interpres", - 140: "red-backed sandpiper, dunlin, Erolia alpina", - 141: "redshank, Tringa totanus", - 142: "dowitcher", - 143: "oystercatcher, oyster catcher", - 144: "pelican", - 145: "king penguin, Aptenodytes patagonica", - 146: "albatross, mollymawk", - 147: "grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus", - 148: "killer whale, killer, orca, grampus, sea wolf, Orcinus orca", - 149: "dugong, Dugong dugon", - 150: "sea lion", - 151: "Chihuahua", - 152: "Japanese spaniel", - 153: "Maltese dog, Maltese terrier, Maltese", - 154: "Pekinese, Pekingese, Peke", - 155: "Shih-Tzu", - 156: "Blenheim spaniel", - 157: "papillon", - 158: "toy terrier", - 159: "Rhodesian ridgeback", - 160: "Afghan hound, Afghan", - 161: "basset, basset hound", - 162: "beagle", - 163: "bloodhound, sleuthhound", - 164: "bluetick", - 165: "black-and-tan coonhound", - 166: "Walker hound, Walker foxhound", - 167: "English foxhound", - 168: "redbone", - 169: "borzoi, Russian wolfhound", - 170: "Irish wolfhound", - 171: "Italian greyhound", - 172: "whippet", - 173: "Ibizan hound, Ibizan Podenco", - 174: "Norwegian elkhound, elkhound", - 175: "otterhound, otter hound", - 176: "Saluki, gazelle hound", - 177: "Scottish deerhound, deerhound", - 178: "Weimaraner", - 179: "Staffordshire bullterrier, Staffordshire bull terrier", - 180: "American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier", - 181: "Bedlington terrier", - 182: "Border terrier", - 183: "Kerry blue terrier", - 184: "Irish terrier", - 185: "Norfolk terrier", - 186: "Norwich terrier", - 187: "Yorkshire terrier", - 188: "wire-haired fox terrier", - 189: "Lakeland terrier", - 190: "Sealyham terrier, Sealyham", - 191: "Airedale, Airedale terrier", - 192: "cairn, cairn terrier", - 193: "Australian terrier", - 194: "Dandie Dinmont, Dandie Dinmont terrier", - 195: "Boston bull, Boston terrier", - 196: "miniature schnauzer", - 197: "giant schnauzer", - 198: "standard schnauzer", - 199: "Scotch terrier, Scottish terrier, Scottie", - 200: "Tibetan terrier, chrysanthemum dog", - 201: "silky terrier, Sydney silky", - 202: "soft-coated wheaten terrier", - 203: "West Highland white terrier", - 204: "Lhasa, Lhasa apso", - 205: "flat-coated retriever", - 206: "curly-coated retriever", - 207: "golden retriever", - 208: "Labrador retriever", - 209: "Chesapeake Bay retriever", - 210: "German short-haired pointer", - 211: "vizsla, Hungarian pointer", - 212: "English setter", - 213: "Irish setter, red setter", - 214: "Gordon setter", - 215: "Brittany spaniel", - 216: "clumber, clumber spaniel", - 217: "English springer, English springer 
spaniel", - 218: "Welsh springer spaniel", - 219: "cocker spaniel, English cocker spaniel, cocker", - 220: "Sussex spaniel", - 221: "Irish water spaniel", - 222: "kuvasz", - 223: "schipperke", - 224: "groenendael", - 225: "malinois", - 226: "briard", - 227: "kelpie", - 228: "komondor", - 229: "Old English sheepdog, bobtail", - 230: "Shetland sheepdog, Shetland sheep dog, Shetland", - 231: "collie", - 232: "Border collie", - 233: "Bouvier des Flandres, Bouviers des Flandres", - 234: "Rottweiler", - 235: "German shepherd, German shepherd dog, German police dog, alsatian", - 236: "Doberman, Doberman pinscher", - 237: "miniature pinscher", - 238: "Greater Swiss Mountain dog", - 239: "Bernese mountain dog", - 240: "Appenzeller", - 241: "EntleBucher", - 242: "boxer", - 243: "bull mastiff", - 244: "Tibetan mastiff", - 245: "French bulldog", - 246: "Great Dane", - 247: "Saint Bernard, St Bernard", - 248: "Eskimo dog, husky", - 249: "malamute, malemute, Alaskan malamute", - 250: "Siberian husky", - 251: "dalmatian, coach dog, carriage dog", - 252: "affenpinscher, monkey pinscher, monkey dog", - 253: "basenji", - 254: "pug, pug-dog", - 255: "Leonberg", - 256: "Newfoundland, Newfoundland dog", - 257: "Great Pyrenees", - 258: "Samoyed, Samoyede", - 259: "Pomeranian", - 260: "chow, chow chow", - 261: "keeshond", - 262: "Brabancon griffon", - 263: "Pembroke, Pembroke Welsh corgi", - 264: "Cardigan, Cardigan Welsh corgi", - 265: "toy poodle", - 266: "miniature poodle", - 267: "standard poodle", - 268: "Mexican hairless", - 269: "timber wolf, grey wolf, gray wolf, Canis lupus", - 270: "white wolf, Arctic wolf, Canis lupus tundrarum", - 271: "red wolf, maned wolf, Canis rufus, Canis niger", - 272: "coyote, prairie wolf, brush wolf, Canis latrans", - 273: "dingo, warrigal, warragal, Canis dingo", - 274: "dhole, Cuon alpinus", - 275: "African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus", - 276: "hyena, hyaena", - 277: "red fox, Vulpes vulpes", - 278: "kit fox, Vulpes macrotis", - 279: "Arctic fox, white fox, Alopex lagopus", - 280: "grey fox, gray fox, Urocyon cinereoargenteus", - 281: "tabby, tabby cat", - 282: "tiger cat", - 283: "Persian cat", - 284: "Siamese cat, Siamese", - 285: "Egyptian cat", - 286: "cougar, puma, catamount, mountain lion, painter, panther, Felis concolor", - 287: "lynx, catamount", - 288: "leopard, Panthera pardus", - 289: "snow leopard, ounce, Panthera uncia", - 290: "jaguar, panther, Panthera onca, Felis onca", - 291: "lion, king of beasts, Panthera leo", - 292: "tiger, Panthera tigris", - 293: "cheetah, chetah, Acinonyx jubatus", - 294: "brown bear, bruin, Ursus arctos", - 295: "American black bear, black bear, Ursus americanus, Euarctos americanus", - 296: "ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus", - 297: "sloth bear, Melursus ursinus, Ursus ursinus", - 298: "mongoose", - 299: "meerkat, mierkat", - 300: "tiger beetle", - 301: "ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle", - 302: "ground beetle, carabid beetle", - 303: "long-horned beetle, longicorn, longicorn beetle", - 304: "leaf beetle, chrysomelid", - 305: "dung beetle", - 306: "rhinoceros beetle", - 307: "weevil", - 308: "fly", - 309: "bee", - 310: "ant, emmet, pismire", - 311: "grasshopper, hopper", - 312: "cricket", - 313: "walking stick, walkingstick, stick insect", - 314: "cockroach, roach", - 315: "mantis, mantid", - 316: "cicada, cicala", - 317: "leafhopper", - 318: "lacewing, lacewing fly", - 319: "dragonfly, darning needle, devil's darning needle, sewing needle, snake 
feeder, snake doctor, mosquito hawk, skeeter hawk", - 320: "damselfly", - 321: "admiral", - 322: "ringlet, ringlet butterfly", - 323: "monarch, monarch butterfly, milkweed butterfly, Danaus plexippus", - 324: "cabbage butterfly", - 325: "sulphur butterfly, sulfur butterfly", - 326: "lycaenid, lycaenid butterfly", - 327: "starfish, sea star", - 328: "sea urchin", - 329: "sea cucumber, holothurian", - 330: "wood rabbit, cottontail, cottontail rabbit", - 331: "hare", - 332: "Angora, Angora rabbit", - 333: "hamster", - 334: "porcupine, hedgehog", - 335: "fox squirrel, eastern fox squirrel, Sciurus niger", - 336: "marmot", - 337: "beaver", - 338: "guinea pig, Cavia cobaya", - 339: "sorrel", - 340: "zebra", - 341: "hog, pig, grunter, squealer, Sus scrofa", - 342: "wild boar, boar, Sus scrofa", - 343: "warthog", - 344: "hippopotamus, hippo, river horse, Hippopotamus amphibius", - 345: "ox", - 346: "water buffalo, water ox, Asiatic buffalo, Bubalus bubalis", - 347: "bison", - 348: "ram, tup", - 349: "bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis", - 350: "ibex, Capra ibex", - 351: "hartebeest", - 352: "impala, Aepyceros melampus", - 353: "gazelle", - 354: "Arabian camel, dromedary, Camelus dromedarius", - 355: "llama", - 356: "weasel", - 357: "mink", - 358: "polecat, fitch, foulmart, foumart, Mustela putorius", - 359: "black-footed ferret, ferret, Mustela nigripes", - 360: "otter", - 361: "skunk, polecat, wood pussy", - 362: "badger", - 363: "armadillo", - 364: "three-toed sloth, ai, Bradypus tridactylus", - 365: "orangutan, orang, orangutang, Pongo pygmaeus", - 366: "gorilla, Gorilla gorilla", - 367: "chimpanzee, chimp, Pan troglodytes", - 368: "gibbon, Hylobates lar", - 369: "siamang, Hylobates syndactylus, Symphalangus syndactylus", - 370: "guenon, guenon monkey", - 371: "patas, hussar monkey, Erythrocebus patas", - 372: "baboon", - 373: "macaque", - 374: "langur", - 375: "colobus, colobus monkey", - 376: "proboscis monkey, Nasalis larvatus", - 377: "marmoset", - 378: "capuchin, ringtail, Cebus capucinus", - 379: "howler monkey, howler", - 380: "titi, titi monkey", - 381: "spider monkey, Ateles geoffroyi", - 382: "squirrel monkey, Saimiri sciureus", - 383: "Madagascar cat, ring-tailed lemur, Lemur catta", - 384: "indri, indris, Indri indri, Indri brevicaudatus", - 385: "Indian elephant, Elephas maximus", - 386: "African elephant, Loxodonta africana", - 387: "lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens", - 388: "giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca", - 389: "barracouta, snoek", - 390: "eel", - 391: "coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch", - 392: "rock beauty, Holocanthus tricolor", - 393: "anemone fish", - 394: "sturgeon", - 395: "gar, garfish, garpike, billfish, Lepisosteus osseus", - 396: "lionfish", - 397: "puffer, pufferfish, blowfish, globefish", - 398: "abacus", - 399: "abaya", - 400: "academic gown, academic robe, judge's robe", - 401: "accordion, piano accordion, squeeze box", - 402: "acoustic guitar", - 403: "aircraft carrier, carrier, flattop, attack aircraft carrier", - 404: "airliner", - 405: "airship, dirigible", - 406: "altar", - 407: "ambulance", - 408: "amphibian, amphibious vehicle", - 409: "analog clock", - 410: "apiary, bee house", - 411: "apron", - 412: "ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin", - 413: "assault rifle, assault gun", - 414: "backpack, back pack, knapsack, packsack, 
rucksack, haversack", - 415: "bakery, bakeshop, bakehouse", - 416: "balance beam, beam", - 417: "balloon", - 418: "ballpoint, ballpoint pen, ballpen, Biro", - 419: "Band Aid", - 420: "banjo", - 421: "bannister, banister, balustrade, balusters, handrail", - 422: "barbell", - 423: "barber chair", - 424: "barbershop", - 425: "barn", - 426: "barometer", - 427: "barrel, cask", - 428: "barrow, garden cart, lawn cart, wheelbarrow", - 429: "baseball", - 430: "basketball", - 431: "bassinet", - 432: "bassoon", - 433: "bathing cap, swimming cap", - 434: "bath towel", - 435: "bathtub, bathing tub, bath, tub", - 436: "beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon", - 437: "beacon, lighthouse, beacon light, pharos", - 438: "beaker", - 439: "bearskin, busby, shako", - 440: "beer bottle", - 441: "beer glass", - 442: "bell cote, bell cot", - 443: "bib", - 444: "bicycle-built-for-two, tandem bicycle, tandem", - 445: "bikini, two-piece", - 446: "binder, ring-binder", - 447: "binoculars, field glasses, opera glasses", - 448: "birdhouse", - 449: "boathouse", - 450: "bobsled, bobsleigh, bob", - 451: "bolo tie, bolo, bola tie, bola", - 452: "bonnet, poke bonnet", - 453: "bookcase", - 454: "bookshop, bookstore, bookstall", - 455: "bottlecap", - 456: "bow", - 457: "bow tie, bow-tie, bowtie", - 458: "brass, memorial tablet, plaque", - 459: "brassiere, bra, bandeau", - 460: "breakwater, groin, groyne, mole, bulwark, seawall, jetty", - 461: "breastplate, aegis, egis", - 462: "broom", - 463: "bucket, pail", - 464: "buckle", - 465: "bulletproof vest", - 466: "bullet train, bullet", - 467: "butcher shop, meat market", - 468: "cab, hack, taxi, taxicab", - 469: "caldron, cauldron", - 470: "candle, taper, wax light", - 471: "cannon", - 472: "canoe", - 473: "can opener, tin opener", - 474: "cardigan", - 475: "car mirror", - 476: "carousel, carrousel, merry-go-round, roundabout, whirligig", - 477: "carpenter's kit, tool kit", - 478: "carton", - 479: "car wheel", - 480: "cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM", - 481: "cassette", - 482: "cassette player", - 483: "castle", - 484: "catamaran", - 485: "CD player", - 486: "cello, violoncello", - 487: "cellular telephone, cellular phone, cellphone, cell, mobile phone", - 488: "chain", - 489: "chainlink fence", - 490: "chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour", - 491: "chain saw, chainsaw", - 492: "chest", - 493: "chiffonier, commode", - 494: "chime, bell, gong", - 495: "china cabinet, china closet", - 496: "Christmas stocking", - 497: "church, church building", - 498: "cinema, movie theater, movie theatre, movie house, picture palace", - 499: "cleaver, meat cleaver, chopper", - 500: "cliff dwelling", - 501: "cloak", - 502: "clog, geta, patten, sabot", - 503: "cocktail shaker", - 504: "coffee mug", - 505: "coffeepot", - 506: "coil, spiral, volute, whorl, helix", - 507: "combination lock", - 508: "computer keyboard, keypad", - 509: "confectionery, confectionary, candy store", - 510: "container ship, containership, container vessel", - 511: "convertible", - 512: "corkscrew, bottle screw", - 513: "cornet, horn, trumpet, trump", - 514: "cowboy boot", - 515: "cowboy hat, ten-gallon hat", - 516: "cradle", - 517: "crane", - 518: "crash helmet", - 519: "crate", - 520: "crib, cot", - 521: "Crock Pot", - 522: "croquet ball", - 523: "crutch", - 524: "cuirass", - 525: "dam, dike, dyke", - 526: "desk", - 527: "desktop computer", - 528: 
"dial telephone, dial phone", - 529: "diaper, nappy, napkin", - 530: "digital clock", - 531: "digital watch", - 532: "dining table, board", - 533: "dishrag, dishcloth", - 534: "dishwasher, dish washer, dishwashing machine", - 535: "disk brake, disc brake", - 536: "dock, dockage, docking facility", - 537: "dogsled, dog sled, dog sleigh", - 538: "dome", - 539: "doormat, welcome mat", - 540: "drilling platform, offshore rig", - 541: "drum, membranophone, tympan", - 542: "drumstick", - 543: "dumbbell", - 544: "Dutch oven", - 545: "electric fan, blower", - 546: "electric guitar", - 547: "electric locomotive", - 548: "entertainment center", - 549: "envelope", - 550: "espresso maker", - 551: "face powder", - 552: "feather boa, boa", - 553: "file, file cabinet, filing cabinet", - 554: "fireboat", - 555: "fire engine, fire truck", - 556: "fire screen, fireguard", - 557: "flagpole, flagstaff", - 558: "flute, transverse flute", - 559: "folding chair", - 560: "football helmet", - 561: "forklift", - 562: "fountain", - 563: "fountain pen", - 564: "four-poster", - 565: "freight car", - 566: "French horn, horn", - 567: "frying pan, frypan, skillet", - 568: "fur coat", - 569: "garbage truck, dustcart", - 570: "gasmask, respirator, gas helmet", - 571: "gas pump, gasoline pump, petrol pump, island dispenser", - 572: "goblet", - 573: "go-kart", - 574: "golf ball", - 575: "golfcart, golf cart", - 576: "gondola", - 577: "gong, tam-tam", - 578: "gown", - 579: "grand piano, grand", - 580: "greenhouse, nursery, glasshouse", - 581: "grille, radiator grille", - 582: "grocery store, grocery, food market, market", - 583: "guillotine", - 584: "hair slide", - 585: "hair spray", - 586: "half track", - 587: "hammer", - 588: "hamper", - 589: "hand blower, blow dryer, blow drier, hair dryer, hair drier", - 590: "hand-held computer, hand-held microcomputer", - 591: "handkerchief, hankie, hanky, hankey", - 592: "hard disc, hard disk, fixed disk", - 593: "harmonica, mouth organ, harp, mouth harp", - 594: "harp", - 595: "harvester, reaper", - 596: "hatchet", - 597: "holster", - 598: "home theater, home theatre", - 599: "honeycomb", - 600: "hook, claw", - 601: "hoopskirt, crinoline", - 602: "horizontal bar, high bar", - 603: "horse cart, horse-cart", - 604: "hourglass", - 605: "iPod", - 606: "iron, smoothing iron", - 607: "jack-o'-lantern", - 608: "jean, blue jean, denim", - 609: "jeep, landrover", - 610: "jersey, T-shirt, tee shirt", - 611: "jigsaw puzzle", - 612: "jinrikisha, ricksha, rickshaw", - 613: "joystick", - 614: "kimono", - 615: "knee pad", - 616: "knot", - 617: "lab coat, laboratory coat", - 618: "ladle", - 619: "lampshade, lamp shade", - 620: "laptop, laptop computer", - 621: "lawn mower, mower", - 622: "lens cap, lens cover", - 623: "letter opener, paper knife, paperknife", - 624: "library", - 625: "lifeboat", - 626: "lighter, light, igniter, ignitor", - 627: "limousine, limo", - 628: "liner, ocean liner", - 629: "lipstick, lip rouge", - 630: "Loafer", - 631: "lotion", - 632: "loudspeaker, speaker, speaker unit, loudspeaker system, speaker system", - 633: "loupe, jeweler's loupe", - 634: "lumbermill, sawmill", - 635: "magnetic compass", - 636: "mailbag, postbag", - 637: "mailbox, letter box", - 638: "maillot", - 639: "maillot, tank suit", - 640: "manhole cover", - 641: "maraca", - 642: "marimba, xylophone", - 643: "mask", - 644: "matchstick", - 645: "maypole", - 646: "maze, labyrinth", - 647: "measuring cup", - 648: "medicine chest, medicine cabinet", - 649: "megalith, megalithic structure", - 650: "microphone, 
mike", - 651: "microwave, microwave oven", - 652: "military uniform", - 653: "milk can", - 654: "minibus", - 655: "miniskirt, mini", - 656: "minivan", - 657: "missile", - 658: "mitten", - 659: "mixing bowl", - 660: "mobile home, manufactured home", - 661: "Model T", - 662: "modem", - 663: "monastery", - 664: "monitor", - 665: "moped", - 666: "mortar", - 667: "mortarboard", - 668: "mosque", - 669: "mosquito net", - 670: "motor scooter, scooter", - 671: "mountain bike, all-terrain bike, off-roader", - 672: "mountain tent", - 673: "mouse, computer mouse", - 674: "mousetrap", - 675: "moving van", - 676: "muzzle", - 677: "nail", - 678: "neck brace", - 679: "necklace", - 680: "nipple", - 681: "notebook, notebook computer", - 682: "obelisk", - 683: "oboe, hautboy, hautbois", - 684: "ocarina, sweet potato", - 685: "odometer, hodometer, mileometer, milometer", - 686: "oil filter", - 687: "organ, pipe organ", - 688: "oscilloscope, scope, cathode-ray oscilloscope, CRO", - 689: "overskirt", - 690: "oxcart", - 691: "oxygen mask", - 692: "packet", - 693: "paddle, boat paddle", - 694: "paddlewheel, paddle wheel", - 695: "padlock", - 696: "paintbrush", - 697: "pajama, pyjama, pj's, jammies", - 698: "palace", - 699: "panpipe, pandean pipe, syrinx", - 700: "paper towel", - 701: "parachute, chute", - 702: "parallel bars, bars", - 703: "park bench", - 704: "parking meter", - 705: "passenger car, coach, carriage", - 706: "patio, terrace", - 707: "pay-phone, pay-station", - 708: "pedestal, plinth, footstall", - 709: "pencil box, pencil case", - 710: "pencil sharpener", - 711: "perfume, essence", - 712: "Petri dish", - 713: "photocopier", - 714: "pick, plectrum, plectron", - 715: "pickelhaube", - 716: "picket fence, paling", - 717: "pickup, pickup truck", - 718: "pier", - 719: "piggy bank, penny bank", - 720: "pill bottle", - 721: "pillow", - 722: "ping-pong ball", - 723: "pinwheel", - 724: "pirate, pirate ship", - 725: "pitcher, ewer", - 726: "plane, carpenter's plane, woodworking plane", - 727: "planetarium", - 728: "plastic bag", - 729: "plate rack", - 730: "plow, plough", - 731: "plunger, plumber's helper", - 732: "Polaroid camera, Polaroid Land camera", - 733: "pole", - 734: "police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria", - 735: "poncho", - 736: "pool table, billiard table, snooker table", - 737: "pop bottle, soda bottle", - 738: "pot, flowerpot", - 739: "potter's wheel", - 740: "power drill", - 741: "prayer rug, prayer mat", - 742: "printer", - 743: "prison, prison house", - 744: "projectile, missile", - 745: "projector", - 746: "puck, hockey puck", - 747: "punching bag, punch bag, punching ball, punchball", - 748: "purse", - 749: "quill, quill pen", - 750: "quilt, comforter, comfort, puff", - 751: "racer, race car, racing car", - 752: "racket, racquet", - 753: "radiator", - 754: "radio, wireless", - 755: "radio telescope, radio reflector", - 756: "rain barrel", - 757: "recreational vehicle, RV, R.V.", - 758: "reel", - 759: "reflex camera", - 760: "refrigerator, icebox", - 761: "remote control, remote", - 762: "restaurant, eating house, eating place, eatery", - 763: "revolver, six-gun, six-shooter", - 764: "rifle", - 765: "rocking chair, rocker", - 766: "rotisserie", - 767: "rubber eraser, rubber, pencil eraser", - 768: "rugby ball", - 769: "rule, ruler", - 770: "running shoe", - 771: "safe", - 772: "safety pin", - 773: "saltshaker, salt shaker", - 774: "sandal", - 775: "sarong", - 776: "sax, saxophone", - 777: "scabbard", - 778: "scale, weighing machine", - 779: "school bus", - 
780: "schooner", - 781: "scoreboard", - 782: "screen, CRT screen", - 783: "screw", - 784: "screwdriver", - 785: "seat belt, seatbelt", - 786: "sewing machine", - 787: "shield, buckler", - 788: "shoe shop, shoe-shop, shoe store", - 789: "shoji", - 790: "shopping basket", - 791: "shopping cart", - 792: "shovel", - 793: "shower cap", - 794: "shower curtain", - 795: "ski", - 796: "ski mask", - 797: "sleeping bag", - 798: "slide rule, slipstick", - 799: "sliding door", - 800: "slot, one-armed bandit", - 801: "snorkel", - 802: "snowmobile", - 803: "snowplow, snowplough", - 804: "soap dispenser", - 805: "soccer ball", - 806: "sock", - 807: "solar dish, solar collector, solar furnace", - 808: "sombrero", - 809: "soup bowl", - 810: "space bar", - 811: "space heater", - 812: "space shuttle", - 813: "spatula", - 814: "speedboat", - 815: "spider web, spider's web", - 816: "spindle", - 817: "sports car, sport car", - 818: "spotlight, spot", - 819: "stage", - 820: "steam locomotive", - 821: "steel arch bridge", - 822: "steel drum", - 823: "stethoscope", - 824: "stole", - 825: "stone wall", - 826: "stopwatch, stop watch", - 827: "stove", - 828: "strainer", - 829: "streetcar, tram, tramcar, trolley, trolley car", - 830: "stretcher", - 831: "studio couch, day bed", - 832: "stupa, tope", - 833: "submarine, pigboat, sub, U-boat", - 834: "suit, suit of clothes", - 835: "sundial", - 836: "sunglass", - 837: "sunglasses, dark glasses, shades", - 838: "sunscreen, sunblock, sun blocker", - 839: "suspension bridge", - 840: "swab, swob, mop", - 841: "sweatshirt", - 842: "swimming trunks, bathing trunks", - 843: "swing", - 844: "switch, electric switch, electrical switch", - 845: "syringe", - 846: "table lamp", - 847: "tank, army tank, armored combat vehicle, armoured combat vehicle", - 848: "tape player", - 849: "teapot", - 850: "teddy, teddy bear", - 851: "television, television system", - 852: "tennis ball", - 853: "thatch, thatched roof", - 854: "theater curtain, theatre curtain", - 855: "thimble", - 856: "thresher, thrasher, threshing machine", - 857: "throne", - 858: "tile roof", - 859: "toaster", - 860: "tobacco shop, tobacconist shop, tobacconist", - 861: "toilet seat", - 862: "torch", - 863: "totem pole", - 864: "tow truck, tow car, wrecker", - 865: "toyshop", - 866: "tractor", - 867: "trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi", - 868: "tray", - 869: "trench coat", - 870: "tricycle, trike, velocipede", - 871: "trimaran", - 872: "tripod", - 873: "triumphal arch", - 874: "trolleybus, trolley coach, trackless trolley", - 875: "trombone", - 876: "tub, vat", - 877: "turnstile", - 878: "typewriter keyboard", - 879: "umbrella", - 880: "unicycle, monocycle", - 881: "upright, upright piano", - 882: "vacuum, vacuum cleaner", - 883: "vase", - 884: "vault", - 885: "velvet", - 886: "vending machine", - 887: "vestment", - 888: "viaduct", - 889: "violin, fiddle", - 890: "volleyball", - 891: "waffle iron", - 892: "wall clock", - 893: "wallet, billfold, notecase, pocketbook", - 894: "wardrobe, closet, press", - 895: "warplane, military plane", - 896: "washbasin, handbasin, washbowl, lavabo, wash-hand basin", - 897: "washer, automatic washer, washing machine", - 898: "water bottle", - 899: "water jug", - 900: "water tower", - 901: "whiskey jug", - 902: "whistle", - 903: "wig", - 904: "window screen", - 905: "window shade", - 906: "Windsor tie", - 907: "wine bottle", - 908: "wing", - 909: "wok", - 910: "wooden spoon", - 911: "wool, woolen, woollen", - 912: "worm fence, snake fence, snake-rail 
fence, Virginia fence", - 913: "wreck", - 914: "yawl", - 915: "yurt", - 916: "web site, website, internet site, site", - 917: "comic book", - 918: "crossword puzzle, crossword", - 919: "street sign", - 920: "traffic light, traffic signal, stoplight", - 921: "book jacket, dust cover, dust jacket, dust wrapper", - 922: "menu", - 923: "plate", - 924: "guacamole", - 925: "consomme", - 926: "hot pot, hotpot", - 927: "trifle", - 928: "ice cream, icecream", - 929: "ice lolly, lolly, lollipop, popsicle", - 930: "French loaf", - 931: "bagel, beigel", - 932: "pretzel", - 933: "cheeseburger", - 934: "hotdog, hot dog, red hot", - 935: "mashed potato", - 936: "head cabbage", - 937: "broccoli", - 938: "cauliflower", - 939: "zucchini, courgette", - 940: "spaghetti squash", - 941: "acorn squash", - 942: "butternut squash", - 943: "cucumber, cuke", - 944: "artichoke, globe artichoke", - 945: "bell pepper", - 946: "cardoon", - 947: "mushroom", - 948: "Granny Smith", - 949: "strawberry", - 950: "orange", - 951: "lemon", - 952: "fig", - 953: "pineapple, ananas", - 954: "banana", - 955: "jackfruit, jak, jack", - 956: "custard apple", - 957: "pomegranate", - 958: "hay", - 959: "carbonara", - 960: "chocolate sauce, chocolate syrup", - 961: "dough", - 962: "meat loaf, meatloaf", - 963: "pizza, pizza pie", - 964: "potpie", - 965: "burrito", - 966: "red wine", - 967: "espresso", - 968: "cup", - 969: "eggnog", - 970: "alp", - 971: "bubble", - 972: "cliff, drop, drop-off", - 973: "coral reef", - 974: "geyser", - 975: "lakeside, lakeshore", - 976: "promontory, headland, head, foreland", - 977: "sandbar, sand bar", - 978: "seashore, coast, seacoast, sea-coast", - 979: "valley, vale", - 980: "volcano", - 981: "ballplayer, baseball player", - 982: "groom, bridegroom", - 983: "scuba diver", - 984: "rapeseed", - 985: "daisy", - 986: "yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum", - 987: "corn", - 988: "acorn", - 989: "hip, rose hip, rosehip", - 990: "buckeye, horse chestnut, conker", - 991: "coral fungus", - 992: "agaric", - 993: "gyromitra", - 994: "stinkhorn, carrion fungus", - 995: "earthstar", - 996: "hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa", - 997: "bolete", - 998: "ear, spike, capitulum", - 999: "toilet tissue, toilet paper, bathroom tissue", -} - # here we list all keys to be renamed (original name on the left, our name on the right) def create_rename_keys(config, base_model=False): @@ -1165,8 +162,8 @@ def prepare_img(image_resolution): im = Image.open(requests.get(url, stream=True).raw) # standard PyTorch mean-std input image normalization - transform = T.Compose( - [T.Resize((image_resolution, image_resolution)), T.ToTensor(), T.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])] + transform = Compose( + [Resize((image_resolution, image_resolution)), ToTensor(), Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])] ) # mean-std normalize the input image (batch-size: 1) @@ -1191,8 +188,8 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path, base_model=False) config.use_pooler = True else: config.num_labels = 1000 - config.id2label = id2class - config.label2id = {v: k for k, v in id2class.items()} + config.id2label = id2label + config.label2id = {v: k for k, v in id2label.items()} config.patch_size = int(vit_name[-6:-4]) config.image_size = int(vit_name[-3:]) # size of the architecture diff --git a/src/transformers/models/vit/feature_extraction_vit.py b/src/transformers/models/vit/feature_extraction_vit.py index 
08aae9b49f1b5a..9166ca792b5c25 100644 --- a/src/transformers/models/vit/feature_extraction_vit.py +++ b/src/transformers/models/vit/feature_extraction_vit.py @@ -36,32 +36,24 @@ class ViTFeatureExtractor(FeatureExtractionMixin): methods. Users should refer to this superclass for more information regarding those methods. Args: - image_mean (:obj:`int`, defaults to [0.5, 0.5, 0.5]): + image_mean (:obj:`int`, defaults to :obj:`[0.5, 0.5, 0.5]`): The sequence of means for each channel, to be used when normalizing images. - image_std (:obj:`int`, defaults to [0.5, 0.5, 0.5]): + image_std (:obj:`int`, defaults to :obj:`[0.5, 0.5, 0.5]`): The sequence of standard deviations for each channel, to be used when normalizing images. do_normalize (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether or not to normalize the input with mean and standard deviation. do_resize (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to resize the input to a certain :obj:`size`. - size (:obj:`int`, `optional`, defaults to :obj:`224`): + size (:obj:`int`, `optional`, defaults to 224): Resize the input to the given size. Only has an effect if :obj:`do_resize` is set to :obj:`True`. """ model_input_names = ["pixel_values"] - def __init__( - self, - image_mean=[0.5, 0.5, 0.5], - image_std=[0.5, 0.5, 0.5], - do_normalize=True, - do_resize=True, - size=224, - **kwargs - ): + def __init__(self, image_mean=None, image_std=None, do_normalize=True, do_resize=True, size=224, **kwargs): super().__init__(**kwargs) - self.image_mean = image_mean - self.image_std = image_std + self.image_mean = [0.5, 0.5, 0.5] + self.image_std = [0.5, 0.5, 0.5] self.do_normalize = do_normalize self.do_resize = do_resize self.size = size @@ -74,6 +66,11 @@ def __call__( """ Main method to prepare for the model one or several image(s). + .. warning:: + + NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass + PIL images. + Args: images (:obj:`PIL.Image.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`): The image or batch of images to be prepared. 
Each image can be a PIL image, NumPy array or a PyTorch diff --git a/src/transformers/utils/imagenet_classes.py b/src/transformers/utils/imagenet_classes.py new file mode 100644 index 00000000000000..73d831095c59c5 --- /dev/null +++ b/src/transformers/utils/imagenet_classes.py @@ -0,0 +1,1003 @@ +# ImageNet 2012 id's to class names +id2label = { + 0: "tench, Tinca tinca", + 1: "goldfish, Carassius auratus", + 2: "great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias", + 3: "tiger shark, Galeocerdo cuvieri", + 4: "hammerhead, hammerhead shark", + 5: "electric ray, crampfish, numbfish, torpedo", + 6: "stingray", + 7: "cock", + 8: "hen", + 9: "ostrich, Struthio camelus", + 10: "brambling, Fringilla montifringilla", + 11: "goldfinch, Carduelis carduelis", + 12: "house finch, linnet, Carpodacus mexicanus", + 13: "junco, snowbird", + 14: "indigo bunting, indigo finch, indigo bird, Passerina cyanea", + 15: "robin, American robin, Turdus migratorius", + 16: "bulbul", + 17: "jay", + 18: "magpie", + 19: "chickadee", + 20: "water ouzel, dipper", + 21: "kite", + 22: "bald eagle, American eagle, Haliaeetus leucocephalus", + 23: "vulture", + 24: "great grey owl, great gray owl, Strix nebulosa", + 25: "European fire salamander, Salamandra salamandra", + 26: "common newt, Triturus vulgaris", + 27: "eft", + 28: "spotted salamander, Ambystoma maculatum", + 29: "axolotl, mud puppy, Ambystoma mexicanum", + 30: "bullfrog, Rana catesbeiana", + 31: "tree frog, tree-frog", + 32: "tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui", + 33: "loggerhead, loggerhead turtle, Caretta caretta", + 34: "leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea", + 35: "mud turtle", + 36: "terrapin", + 37: "box turtle, box tortoise", + 38: "banded gecko", + 39: "common iguana, iguana, Iguana iguana", + 40: "American chameleon, anole, Anolis carolinensis", + 41: "whiptail, whiptail lizard", + 42: "agama", + 43: "frilled lizard, Chlamydosaurus kingi", + 44: "alligator lizard", + 45: "Gila monster, Heloderma suspectum", + 46: "green lizard, Lacerta viridis", + 47: "African chameleon, Chamaeleo chamaeleon", + 48: "Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis", + 49: "African crocodile, Nile crocodile, Crocodylus niloticus", + 50: "American alligator, Alligator mississipiensis", + 51: "triceratops", + 52: "thunder snake, worm snake, Carphophis amoenus", + 53: "ringneck snake, ring-necked snake, ring snake", + 54: "hognose snake, puff adder, sand viper", + 55: "green snake, grass snake", + 56: "king snake, kingsnake", + 57: "garter snake, grass snake", + 58: "water snake", + 59: "vine snake", + 60: "night snake, Hypsiglena torquata", + 61: "boa constrictor, Constrictor constrictor", + 62: "rock python, rock snake, Python sebae", + 63: "Indian cobra, Naja naja", + 64: "green mamba", + 65: "sea snake", + 66: "horned viper, cerastes, sand viper, horned asp, Cerastes cornutus", + 67: "diamondback, diamondback rattlesnake, Crotalus adamanteus", + 68: "sidewinder, horned rattlesnake, Crotalus cerastes", + 69: "trilobite", + 70: "harvestman, daddy longlegs, Phalangium opilio", + 71: "scorpion", + 72: "black and gold garden spider, Argiope aurantia", + 73: "barn spider, Araneus cavaticus", + 74: "garden spider, Aranea diademata", + 75: "black widow, Latrodectus mactans", + 76: "tarantula", + 77: "wolf spider, hunting spider", + 78: "tick", + 79: "centipede", + 80: "black grouse", + 81: "ptarmigan", + 82: "ruffed grouse, partridge, Bonasa umbellus", 
+ 83: "prairie chicken, prairie grouse, prairie fowl", + 84: "peacock", + 85: "quail", + 86: "partridge", + 87: "African grey, African gray, Psittacus erithacus", + 88: "macaw", + 89: "sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita", + 90: "lorikeet", + 91: "coucal", + 92: "bee eater", + 93: "hornbill", + 94: "hummingbird", + 95: "jacamar", + 96: "toucan", + 97: "drake", + 98: "red-breasted merganser, Mergus serrator", + 99: "goose", + 100: "black swan, Cygnus atratus", + 101: "tusker", + 102: "echidna, spiny anteater, anteater", + 103: "platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus", + 104: "wallaby, brush kangaroo", + 105: "koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus", + 106: "wombat", + 107: "jellyfish", + 108: "sea anemone, anemone", + 109: "brain coral", + 110: "flatworm, platyhelminth", + 111: "nematode, nematode worm, roundworm", + 112: "conch", + 113: "snail", + 114: "slug", + 115: "sea slug, nudibranch", + 116: "chiton, coat-of-mail shell, sea cradle, polyplacophore", + 117: "chambered nautilus, pearly nautilus, nautilus", + 118: "Dungeness crab, Cancer magister", + 119: "rock crab, Cancer irroratus", + 120: "fiddler crab", + 121: "king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica", + 122: "American lobster, Northern lobster, Maine lobster, Homarus americanus", + 123: "spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish", + 124: "crayfish, crawfish, crawdad, crawdaddy", + 125: "hermit crab", + 126: "isopod", + 127: "white stork, Ciconia ciconia", + 128: "black stork, Ciconia nigra", + 129: "spoonbill", + 130: "flamingo", + 131: "little blue heron, Egretta caerulea", + 132: "American egret, great white heron, Egretta albus", + 133: "bittern", + 134: "crane", + 135: "limpkin, Aramus pictus", + 136: "European gallinule, Porphyrio porphyrio", + 137: "American coot, marsh hen, mud hen, water hen, Fulica americana", + 138: "bustard", + 139: "ruddy turnstone, Arenaria interpres", + 140: "red-backed sandpiper, dunlin, Erolia alpina", + 141: "redshank, Tringa totanus", + 142: "dowitcher", + 143: "oystercatcher, oyster catcher", + 144: "pelican", + 145: "king penguin, Aptenodytes patagonica", + 146: "albatross, mollymawk", + 147: "grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus", + 148: "killer whale, killer, orca, grampus, sea wolf, Orcinus orca", + 149: "dugong, Dugong dugon", + 150: "sea lion", + 151: "Chihuahua", + 152: "Japanese spaniel", + 153: "Maltese dog, Maltese terrier, Maltese", + 154: "Pekinese, Pekingese, Peke", + 155: "Shih-Tzu", + 156: "Blenheim spaniel", + 157: "papillon", + 158: "toy terrier", + 159: "Rhodesian ridgeback", + 160: "Afghan hound, Afghan", + 161: "basset, basset hound", + 162: "beagle", + 163: "bloodhound, sleuthhound", + 164: "bluetick", + 165: "black-and-tan coonhound", + 166: "Walker hound, Walker foxhound", + 167: "English foxhound", + 168: "redbone", + 169: "borzoi, Russian wolfhound", + 170: "Irish wolfhound", + 171: "Italian greyhound", + 172: "whippet", + 173: "Ibizan hound, Ibizan Podenco", + 174: "Norwegian elkhound, elkhound", + 175: "otterhound, otter hound", + 176: "Saluki, gazelle hound", + 177: "Scottish deerhound, deerhound", + 178: "Weimaraner", + 179: "Staffordshire bullterrier, Staffordshire bull terrier", + 180: "American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier", + 181: "Bedlington terrier", + 182: "Border terrier", + 
183: "Kerry blue terrier", + 184: "Irish terrier", + 185: "Norfolk terrier", + 186: "Norwich terrier", + 187: "Yorkshire terrier", + 188: "wire-haired fox terrier", + 189: "Lakeland terrier", + 190: "Sealyham terrier, Sealyham", + 191: "Airedale, Airedale terrier", + 192: "cairn, cairn terrier", + 193: "Australian terrier", + 194: "Dandie Dinmont, Dandie Dinmont terrier", + 195: "Boston bull, Boston terrier", + 196: "miniature schnauzer", + 197: "giant schnauzer", + 198: "standard schnauzer", + 199: "Scotch terrier, Scottish terrier, Scottie", + 200: "Tibetan terrier, chrysanthemum dog", + 201: "silky terrier, Sydney silky", + 202: "soft-coated wheaten terrier", + 203: "West Highland white terrier", + 204: "Lhasa, Lhasa apso", + 205: "flat-coated retriever", + 206: "curly-coated retriever", + 207: "golden retriever", + 208: "Labrador retriever", + 209: "Chesapeake Bay retriever", + 210: "German short-haired pointer", + 211: "vizsla, Hungarian pointer", + 212: "English setter", + 213: "Irish setter, red setter", + 214: "Gordon setter", + 215: "Brittany spaniel", + 216: "clumber, clumber spaniel", + 217: "English springer, English springer spaniel", + 218: "Welsh springer spaniel", + 219: "cocker spaniel, English cocker spaniel, cocker", + 220: "Sussex spaniel", + 221: "Irish water spaniel", + 222: "kuvasz", + 223: "schipperke", + 224: "groenendael", + 225: "malinois", + 226: "briard", + 227: "kelpie", + 228: "komondor", + 229: "Old English sheepdog, bobtail", + 230: "Shetland sheepdog, Shetland sheep dog, Shetland", + 231: "collie", + 232: "Border collie", + 233: "Bouvier des Flandres, Bouviers des Flandres", + 234: "Rottweiler", + 235: "German shepherd, German shepherd dog, German police dog, alsatian", + 236: "Doberman, Doberman pinscher", + 237: "miniature pinscher", + 238: "Greater Swiss Mountain dog", + 239: "Bernese mountain dog", + 240: "Appenzeller", + 241: "EntleBucher", + 242: "boxer", + 243: "bull mastiff", + 244: "Tibetan mastiff", + 245: "French bulldog", + 246: "Great Dane", + 247: "Saint Bernard, St Bernard", + 248: "Eskimo dog, husky", + 249: "malamute, malemute, Alaskan malamute", + 250: "Siberian husky", + 251: "dalmatian, coach dog, carriage dog", + 252: "affenpinscher, monkey pinscher, monkey dog", + 253: "basenji", + 254: "pug, pug-dog", + 255: "Leonberg", + 256: "Newfoundland, Newfoundland dog", + 257: "Great Pyrenees", + 258: "Samoyed, Samoyede", + 259: "Pomeranian", + 260: "chow, chow chow", + 261: "keeshond", + 262: "Brabancon griffon", + 263: "Pembroke, Pembroke Welsh corgi", + 264: "Cardigan, Cardigan Welsh corgi", + 265: "toy poodle", + 266: "miniature poodle", + 267: "standard poodle", + 268: "Mexican hairless", + 269: "timber wolf, grey wolf, gray wolf, Canis lupus", + 270: "white wolf, Arctic wolf, Canis lupus tundrarum", + 271: "red wolf, maned wolf, Canis rufus, Canis niger", + 272: "coyote, prairie wolf, brush wolf, Canis latrans", + 273: "dingo, warrigal, warragal, Canis dingo", + 274: "dhole, Cuon alpinus", + 275: "African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus", + 276: "hyena, hyaena", + 277: "red fox, Vulpes vulpes", + 278: "kit fox, Vulpes macrotis", + 279: "Arctic fox, white fox, Alopex lagopus", + 280: "grey fox, gray fox, Urocyon cinereoargenteus", + 281: "tabby, tabby cat", + 282: "tiger cat", + 283: "Persian cat", + 284: "Siamese cat, Siamese", + 285: "Egyptian cat", + 286: "cougar, puma, catamount, mountain lion, painter, panther, Felis concolor", + 287: "lynx, catamount", + 288: "leopard, Panthera pardus", + 289: "snow leopard, 
ounce, Panthera uncia", + 290: "jaguar, panther, Panthera onca, Felis onca", + 291: "lion, king of beasts, Panthera leo", + 292: "tiger, Panthera tigris", + 293: "cheetah, chetah, Acinonyx jubatus", + 294: "brown bear, bruin, Ursus arctos", + 295: "American black bear, black bear, Ursus americanus, Euarctos americanus", + 296: "ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus", + 297: "sloth bear, Melursus ursinus, Ursus ursinus", + 298: "mongoose", + 299: "meerkat, mierkat", + 300: "tiger beetle", + 301: "ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle", + 302: "ground beetle, carabid beetle", + 303: "long-horned beetle, longicorn, longicorn beetle", + 304: "leaf beetle, chrysomelid", + 305: "dung beetle", + 306: "rhinoceros beetle", + 307: "weevil", + 308: "fly", + 309: "bee", + 310: "ant, emmet, pismire", + 311: "grasshopper, hopper", + 312: "cricket", + 313: "walking stick, walkingstick, stick insect", + 314: "cockroach, roach", + 315: "mantis, mantid", + 316: "cicada, cicala", + 317: "leafhopper", + 318: "lacewing, lacewing fly", + 319: "dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk", + 320: "damselfly", + 321: "admiral", + 322: "ringlet, ringlet butterfly", + 323: "monarch, monarch butterfly, milkweed butterfly, Danaus plexippus", + 324: "cabbage butterfly", + 325: "sulphur butterfly, sulfur butterfly", + 326: "lycaenid, lycaenid butterfly", + 327: "starfish, sea star", + 328: "sea urchin", + 329: "sea cucumber, holothurian", + 330: "wood rabbit, cottontail, cottontail rabbit", + 331: "hare", + 332: "Angora, Angora rabbit", + 333: "hamster", + 334: "porcupine, hedgehog", + 335: "fox squirrel, eastern fox squirrel, Sciurus niger", + 336: "marmot", + 337: "beaver", + 338: "guinea pig, Cavia cobaya", + 339: "sorrel", + 340: "zebra", + 341: "hog, pig, grunter, squealer, Sus scrofa", + 342: "wild boar, boar, Sus scrofa", + 343: "warthog", + 344: "hippopotamus, hippo, river horse, Hippopotamus amphibius", + 345: "ox", + 346: "water buffalo, water ox, Asiatic buffalo, Bubalus bubalis", + 347: "bison", + 348: "ram, tup", + 349: "bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis", + 350: "ibex, Capra ibex", + 351: "hartebeest", + 352: "impala, Aepyceros melampus", + 353: "gazelle", + 354: "Arabian camel, dromedary, Camelus dromedarius", + 355: "llama", + 356: "weasel", + 357: "mink", + 358: "polecat, fitch, foulmart, foumart, Mustela putorius", + 359: "black-footed ferret, ferret, Mustela nigripes", + 360: "otter", + 361: "skunk, polecat, wood pussy", + 362: "badger", + 363: "armadillo", + 364: "three-toed sloth, ai, Bradypus tridactylus", + 365: "orangutan, orang, orangutang, Pongo pygmaeus", + 366: "gorilla, Gorilla gorilla", + 367: "chimpanzee, chimp, Pan troglodytes", + 368: "gibbon, Hylobates lar", + 369: "siamang, Hylobates syndactylus, Symphalangus syndactylus", + 370: "guenon, guenon monkey", + 371: "patas, hussar monkey, Erythrocebus patas", + 372: "baboon", + 373: "macaque", + 374: "langur", + 375: "colobus, colobus monkey", + 376: "proboscis monkey, Nasalis larvatus", + 377: "marmoset", + 378: "capuchin, ringtail, Cebus capucinus", + 379: "howler monkey, howler", + 380: "titi, titi monkey", + 381: "spider monkey, Ateles geoffroyi", + 382: "squirrel monkey, Saimiri sciureus", + 383: "Madagascar cat, ring-tailed lemur, Lemur catta", + 384: "indri, indris, Indri indri, Indri brevicaudatus", + 385: "Indian elephant, Elephas maximus", + 386: 
"African elephant, Loxodonta africana", + 387: "lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens", + 388: "giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca", + 389: "barracouta, snoek", + 390: "eel", + 391: "coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch", + 392: "rock beauty, Holocanthus tricolor", + 393: "anemone fish", + 394: "sturgeon", + 395: "gar, garfish, garpike, billfish, Lepisosteus osseus", + 396: "lionfish", + 397: "puffer, pufferfish, blowfish, globefish", + 398: "abacus", + 399: "abaya", + 400: "academic gown, academic robe, judge's robe", + 401: "accordion, piano accordion, squeeze box", + 402: "acoustic guitar", + 403: "aircraft carrier, carrier, flattop, attack aircraft carrier", + 404: "airliner", + 405: "airship, dirigible", + 406: "altar", + 407: "ambulance", + 408: "amphibian, amphibious vehicle", + 409: "analog clock", + 410: "apiary, bee house", + 411: "apron", + 412: "ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin", + 413: "assault rifle, assault gun", + 414: "backpack, back pack, knapsack, packsack, rucksack, haversack", + 415: "bakery, bakeshop, bakehouse", + 416: "balance beam, beam", + 417: "balloon", + 418: "ballpoint, ballpoint pen, ballpen, Biro", + 419: "Band Aid", + 420: "banjo", + 421: "bannister, banister, balustrade, balusters, handrail", + 422: "barbell", + 423: "barber chair", + 424: "barbershop", + 425: "barn", + 426: "barometer", + 427: "barrel, cask", + 428: "barrow, garden cart, lawn cart, wheelbarrow", + 429: "baseball", + 430: "basketball", + 431: "bassinet", + 432: "bassoon", + 433: "bathing cap, swimming cap", + 434: "bath towel", + 435: "bathtub, bathing tub, bath, tub", + 436: "beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon", + 437: "beacon, lighthouse, beacon light, pharos", + 438: "beaker", + 439: "bearskin, busby, shako", + 440: "beer bottle", + 441: "beer glass", + 442: "bell cote, bell cot", + 443: "bib", + 444: "bicycle-built-for-two, tandem bicycle, tandem", + 445: "bikini, two-piece", + 446: "binder, ring-binder", + 447: "binoculars, field glasses, opera glasses", + 448: "birdhouse", + 449: "boathouse", + 450: "bobsled, bobsleigh, bob", + 451: "bolo tie, bolo, bola tie, bola", + 452: "bonnet, poke bonnet", + 453: "bookcase", + 454: "bookshop, bookstore, bookstall", + 455: "bottlecap", + 456: "bow", + 457: "bow tie, bow-tie, bowtie", + 458: "brass, memorial tablet, plaque", + 459: "brassiere, bra, bandeau", + 460: "breakwater, groin, groyne, mole, bulwark, seawall, jetty", + 461: "breastplate, aegis, egis", + 462: "broom", + 463: "bucket, pail", + 464: "buckle", + 465: "bulletproof vest", + 466: "bullet train, bullet", + 467: "butcher shop, meat market", + 468: "cab, hack, taxi, taxicab", + 469: "caldron, cauldron", + 470: "candle, taper, wax light", + 471: "cannon", + 472: "canoe", + 473: "can opener, tin opener", + 474: "cardigan", + 475: "car mirror", + 476: "carousel, carrousel, merry-go-round, roundabout, whirligig", + 477: "carpenter's kit, tool kit", + 478: "carton", + 479: "car wheel", + 480: "cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM", + 481: "cassette", + 482: "cassette player", + 483: "castle", + 484: "catamaran", + 485: "CD player", + 486: "cello, violoncello", + 487: "cellular telephone, cellular phone, cellphone, cell, mobile phone", + 488: "chain", + 489: "chainlink fence", + 490: "chain 
mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour", + 491: "chain saw, chainsaw", + 492: "chest", + 493: "chiffonier, commode", + 494: "chime, bell, gong", + 495: "china cabinet, china closet", + 496: "Christmas stocking", + 497: "church, church building", + 498: "cinema, movie theater, movie theatre, movie house, picture palace", + 499: "cleaver, meat cleaver, chopper", + 500: "cliff dwelling", + 501: "cloak", + 502: "clog, geta, patten, sabot", + 503: "cocktail shaker", + 504: "coffee mug", + 505: "coffeepot", + 506: "coil, spiral, volute, whorl, helix", + 507: "combination lock", + 508: "computer keyboard, keypad", + 509: "confectionery, confectionary, candy store", + 510: "container ship, containership, container vessel", + 511: "convertible", + 512: "corkscrew, bottle screw", + 513: "cornet, horn, trumpet, trump", + 514: "cowboy boot", + 515: "cowboy hat, ten-gallon hat", + 516: "cradle", + 517: "crane", + 518: "crash helmet", + 519: "crate", + 520: "crib, cot", + 521: "Crock Pot", + 522: "croquet ball", + 523: "crutch", + 524: "cuirass", + 525: "dam, dike, dyke", + 526: "desk", + 527: "desktop computer", + 528: "dial telephone, dial phone", + 529: "diaper, nappy, napkin", + 530: "digital clock", + 531: "digital watch", + 532: "dining table, board", + 533: "dishrag, dishcloth", + 534: "dishwasher, dish washer, dishwashing machine", + 535: "disk brake, disc brake", + 536: "dock, dockage, docking facility", + 537: "dogsled, dog sled, dog sleigh", + 538: "dome", + 539: "doormat, welcome mat", + 540: "drilling platform, offshore rig", + 541: "drum, membranophone, tympan", + 542: "drumstick", + 543: "dumbbell", + 544: "Dutch oven", + 545: "electric fan, blower", + 546: "electric guitar", + 547: "electric locomotive", + 548: "entertainment center", + 549: "envelope", + 550: "espresso maker", + 551: "face powder", + 552: "feather boa, boa", + 553: "file, file cabinet, filing cabinet", + 554: "fireboat", + 555: "fire engine, fire truck", + 556: "fire screen, fireguard", + 557: "flagpole, flagstaff", + 558: "flute, transverse flute", + 559: "folding chair", + 560: "football helmet", + 561: "forklift", + 562: "fountain", + 563: "fountain pen", + 564: "four-poster", + 565: "freight car", + 566: "French horn, horn", + 567: "frying pan, frypan, skillet", + 568: "fur coat", + 569: "garbage truck, dustcart", + 570: "gasmask, respirator, gas helmet", + 571: "gas pump, gasoline pump, petrol pump, island dispenser", + 572: "goblet", + 573: "go-kart", + 574: "golf ball", + 575: "golfcart, golf cart", + 576: "gondola", + 577: "gong, tam-tam", + 578: "gown", + 579: "grand piano, grand", + 580: "greenhouse, nursery, glasshouse", + 581: "grille, radiator grille", + 582: "grocery store, grocery, food market, market", + 583: "guillotine", + 584: "hair slide", + 585: "hair spray", + 586: "half track", + 587: "hammer", + 588: "hamper", + 589: "hand blower, blow dryer, blow drier, hair dryer, hair drier", + 590: "hand-held computer, hand-held microcomputer", + 591: "handkerchief, hankie, hanky, hankey", + 592: "hard disc, hard disk, fixed disk", + 593: "harmonica, mouth organ, harp, mouth harp", + 594: "harp", + 595: "harvester, reaper", + 596: "hatchet", + 597: "holster", + 598: "home theater, home theatre", + 599: "honeycomb", + 600: "hook, claw", + 601: "hoopskirt, crinoline", + 602: "horizontal bar, high bar", + 603: "horse cart, horse-cart", + 604: "hourglass", + 605: "iPod", + 606: "iron, smoothing iron", + 607: "jack-o'-lantern", + 608: "jean, blue jean, denim", + 609: "jeep, 
landrover", + 610: "jersey, T-shirt, tee shirt", + 611: "jigsaw puzzle", + 612: "jinrikisha, ricksha, rickshaw", + 613: "joystick", + 614: "kimono", + 615: "knee pad", + 616: "knot", + 617: "lab coat, laboratory coat", + 618: "ladle", + 619: "lampshade, lamp shade", + 620: "laptop, laptop computer", + 621: "lawn mower, mower", + 622: "lens cap, lens cover", + 623: "letter opener, paper knife, paperknife", + 624: "library", + 625: "lifeboat", + 626: "lighter, light, igniter, ignitor", + 627: "limousine, limo", + 628: "liner, ocean liner", + 629: "lipstick, lip rouge", + 630: "Loafer", + 631: "lotion", + 632: "loudspeaker, speaker, speaker unit, loudspeaker system, speaker system", + 633: "loupe, jeweler's loupe", + 634: "lumbermill, sawmill", + 635: "magnetic compass", + 636: "mailbag, postbag", + 637: "mailbox, letter box", + 638: "maillot", + 639: "maillot, tank suit", + 640: "manhole cover", + 641: "maraca", + 642: "marimba, xylophone", + 643: "mask", + 644: "matchstick", + 645: "maypole", + 646: "maze, labyrinth", + 647: "measuring cup", + 648: "medicine chest, medicine cabinet", + 649: "megalith, megalithic structure", + 650: "microphone, mike", + 651: "microwave, microwave oven", + 652: "military uniform", + 653: "milk can", + 654: "minibus", + 655: "miniskirt, mini", + 656: "minivan", + 657: "missile", + 658: "mitten", + 659: "mixing bowl", + 660: "mobile home, manufactured home", + 661: "Model T", + 662: "modem", + 663: "monastery", + 664: "monitor", + 665: "moped", + 666: "mortar", + 667: "mortarboard", + 668: "mosque", + 669: "mosquito net", + 670: "motor scooter, scooter", + 671: "mountain bike, all-terrain bike, off-roader", + 672: "mountain tent", + 673: "mouse, computer mouse", + 674: "mousetrap", + 675: "moving van", + 676: "muzzle", + 677: "nail", + 678: "neck brace", + 679: "necklace", + 680: "nipple", + 681: "notebook, notebook computer", + 682: "obelisk", + 683: "oboe, hautboy, hautbois", + 684: "ocarina, sweet potato", + 685: "odometer, hodometer, mileometer, milometer", + 686: "oil filter", + 687: "organ, pipe organ", + 688: "oscilloscope, scope, cathode-ray oscilloscope, CRO", + 689: "overskirt", + 690: "oxcart", + 691: "oxygen mask", + 692: "packet", + 693: "paddle, boat paddle", + 694: "paddlewheel, paddle wheel", + 695: "padlock", + 696: "paintbrush", + 697: "pajama, pyjama, pj's, jammies", + 698: "palace", + 699: "panpipe, pandean pipe, syrinx", + 700: "paper towel", + 701: "parachute, chute", + 702: "parallel bars, bars", + 703: "park bench", + 704: "parking meter", + 705: "passenger car, coach, carriage", + 706: "patio, terrace", + 707: "pay-phone, pay-station", + 708: "pedestal, plinth, footstall", + 709: "pencil box, pencil case", + 710: "pencil sharpener", + 711: "perfume, essence", + 712: "Petri dish", + 713: "photocopier", + 714: "pick, plectrum, plectron", + 715: "pickelhaube", + 716: "picket fence, paling", + 717: "pickup, pickup truck", + 718: "pier", + 719: "piggy bank, penny bank", + 720: "pill bottle", + 721: "pillow", + 722: "ping-pong ball", + 723: "pinwheel", + 724: "pirate, pirate ship", + 725: "pitcher, ewer", + 726: "plane, carpenter's plane, woodworking plane", + 727: "planetarium", + 728: "plastic bag", + 729: "plate rack", + 730: "plow, plough", + 731: "plunger, plumber's helper", + 732: "Polaroid camera, Polaroid Land camera", + 733: "pole", + 734: "police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria", + 735: "poncho", + 736: "pool table, billiard table, snooker table", + 737: "pop bottle, soda bottle", + 738: "pot, 
flowerpot", + 739: "potter's wheel", + 740: "power drill", + 741: "prayer rug, prayer mat", + 742: "printer", + 743: "prison, prison house", + 744: "projectile, missile", + 745: "projector", + 746: "puck, hockey puck", + 747: "punching bag, punch bag, punching ball, punchball", + 748: "purse", + 749: "quill, quill pen", + 750: "quilt, comforter, comfort, puff", + 751: "racer, race car, racing car", + 752: "racket, racquet", + 753: "radiator", + 754: "radio, wireless", + 755: "radio telescope, radio reflector", + 756: "rain barrel", + 757: "recreational vehicle, RV, R.V.", + 758: "reel", + 759: "reflex camera", + 760: "refrigerator, icebox", + 761: "remote control, remote", + 762: "restaurant, eating house, eating place, eatery", + 763: "revolver, six-gun, six-shooter", + 764: "rifle", + 765: "rocking chair, rocker", + 766: "rotisserie", + 767: "rubber eraser, rubber, pencil eraser", + 768: "rugby ball", + 769: "rule, ruler", + 770: "running shoe", + 771: "safe", + 772: "safety pin", + 773: "saltshaker, salt shaker", + 774: "sandal", + 775: "sarong", + 776: "sax, saxophone", + 777: "scabbard", + 778: "scale, weighing machine", + 779: "school bus", + 780: "schooner", + 781: "scoreboard", + 782: "screen, CRT screen", + 783: "screw", + 784: "screwdriver", + 785: "seat belt, seatbelt", + 786: "sewing machine", + 787: "shield, buckler", + 788: "shoe shop, shoe-shop, shoe store", + 789: "shoji", + 790: "shopping basket", + 791: "shopping cart", + 792: "shovel", + 793: "shower cap", + 794: "shower curtain", + 795: "ski", + 796: "ski mask", + 797: "sleeping bag", + 798: "slide rule, slipstick", + 799: "sliding door", + 800: "slot, one-armed bandit", + 801: "snorkel", + 802: "snowmobile", + 803: "snowplow, snowplough", + 804: "soap dispenser", + 805: "soccer ball", + 806: "sock", + 807: "solar dish, solar collector, solar furnace", + 808: "sombrero", + 809: "soup bowl", + 810: "space bar", + 811: "space heater", + 812: "space shuttle", + 813: "spatula", + 814: "speedboat", + 815: "spider web, spider's web", + 816: "spindle", + 817: "sports car, sport car", + 818: "spotlight, spot", + 819: "stage", + 820: "steam locomotive", + 821: "steel arch bridge", + 822: "steel drum", + 823: "stethoscope", + 824: "stole", + 825: "stone wall", + 826: "stopwatch, stop watch", + 827: "stove", + 828: "strainer", + 829: "streetcar, tram, tramcar, trolley, trolley car", + 830: "stretcher", + 831: "studio couch, day bed", + 832: "stupa, tope", + 833: "submarine, pigboat, sub, U-boat", + 834: "suit, suit of clothes", + 835: "sundial", + 836: "sunglass", + 837: "sunglasses, dark glasses, shades", + 838: "sunscreen, sunblock, sun blocker", + 839: "suspension bridge", + 840: "swab, swob, mop", + 841: "sweatshirt", + 842: "swimming trunks, bathing trunks", + 843: "swing", + 844: "switch, electric switch, electrical switch", + 845: "syringe", + 846: "table lamp", + 847: "tank, army tank, armored combat vehicle, armoured combat vehicle", + 848: "tape player", + 849: "teapot", + 850: "teddy, teddy bear", + 851: "television, television system", + 852: "tennis ball", + 853: "thatch, thatched roof", + 854: "theater curtain, theatre curtain", + 855: "thimble", + 856: "thresher, thrasher, threshing machine", + 857: "throne", + 858: "tile roof", + 859: "toaster", + 860: "tobacco shop, tobacconist shop, tobacconist", + 861: "toilet seat", + 862: "torch", + 863: "totem pole", + 864: "tow truck, tow car, wrecker", + 865: "toyshop", + 866: "tractor", + 867: "trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi", + 
868: "tray", + 869: "trench coat", + 870: "tricycle, trike, velocipede", + 871: "trimaran", + 872: "tripod", + 873: "triumphal arch", + 874: "trolleybus, trolley coach, trackless trolley", + 875: "trombone", + 876: "tub, vat", + 877: "turnstile", + 878: "typewriter keyboard", + 879: "umbrella", + 880: "unicycle, monocycle", + 881: "upright, upright piano", + 882: "vacuum, vacuum cleaner", + 883: "vase", + 884: "vault", + 885: "velvet", + 886: "vending machine", + 887: "vestment", + 888: "viaduct", + 889: "violin, fiddle", + 890: "volleyball", + 891: "waffle iron", + 892: "wall clock", + 893: "wallet, billfold, notecase, pocketbook", + 894: "wardrobe, closet, press", + 895: "warplane, military plane", + 896: "washbasin, handbasin, washbowl, lavabo, wash-hand basin", + 897: "washer, automatic washer, washing machine", + 898: "water bottle", + 899: "water jug", + 900: "water tower", + 901: "whiskey jug", + 902: "whistle", + 903: "wig", + 904: "window screen", + 905: "window shade", + 906: "Windsor tie", + 907: "wine bottle", + 908: "wing", + 909: "wok", + 910: "wooden spoon", + 911: "wool, woolen, woollen", + 912: "worm fence, snake fence, snake-rail fence, Virginia fence", + 913: "wreck", + 914: "yawl", + 915: "yurt", + 916: "web site, website, internet site, site", + 917: "comic book", + 918: "crossword puzzle, crossword", + 919: "street sign", + 920: "traffic light, traffic signal, stoplight", + 921: "book jacket, dust cover, dust jacket, dust wrapper", + 922: "menu", + 923: "plate", + 924: "guacamole", + 925: "consomme", + 926: "hot pot, hotpot", + 927: "trifle", + 928: "ice cream, icecream", + 929: "ice lolly, lolly, lollipop, popsicle", + 930: "French loaf", + 931: "bagel, beigel", + 932: "pretzel", + 933: "cheeseburger", + 934: "hotdog, hot dog, red hot", + 935: "mashed potato", + 936: "head cabbage", + 937: "broccoli", + 938: "cauliflower", + 939: "zucchini, courgette", + 940: "spaghetti squash", + 941: "acorn squash", + 942: "butternut squash", + 943: "cucumber, cuke", + 944: "artichoke, globe artichoke", + 945: "bell pepper", + 946: "cardoon", + 947: "mushroom", + 948: "Granny Smith", + 949: "strawberry", + 950: "orange", + 951: "lemon", + 952: "fig", + 953: "pineapple, ananas", + 954: "banana", + 955: "jackfruit, jak, jack", + 956: "custard apple", + 957: "pomegranate", + 958: "hay", + 959: "carbonara", + 960: "chocolate sauce, chocolate syrup", + 961: "dough", + 962: "meat loaf, meatloaf", + 963: "pizza, pizza pie", + 964: "potpie", + 965: "burrito", + 966: "red wine", + 967: "espresso", + 968: "cup", + 969: "eggnog", + 970: "alp", + 971: "bubble", + 972: "cliff, drop, drop-off", + 973: "coral reef", + 974: "geyser", + 975: "lakeside, lakeshore", + 976: "promontory, headland, head, foreland", + 977: "sandbar, sand bar", + 978: "seashore, coast, seacoast, sea-coast", + 979: "valley, vale", + 980: "volcano", + 981: "ballplayer, baseball player", + 982: "groom, bridegroom", + 983: "scuba diver", + 984: "rapeseed", + 985: "daisy", + 986: "yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum", + 987: "corn", + 988: "acorn", + 989: "hip, rose hip, rosehip", + 990: "buckeye, horse chestnut, conker", + 991: "coral fungus", + 992: "agaric", + 993: "gyromitra", + 994: "stinkhorn, carrion fungus", + 995: "earthstar", + 996: "hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa", + 997: "bolete", + 998: "ear, spike, capitulum", + 999: "toilet tissue, toilet paper, bathroom tissue", +} diff --git a/tests/test_modeling_vit.py 
b/tests/test_modeling_vit.py
index cff9d19d323a0a..b4cec28096e840 100644
--- a/tests/test_modeling_vit.py
+++ b/tests/test_modeling_vit.py
@@ -342,12 +342,12 @@ def prepare_img():
 class ViTModelIntegrationTest(unittest.TestCase):
     @cached_property
     def default_feature_extractor(self):
-        # TODO: add .from_pretrained()
-        return ViTFeatureExtractor() if is_torchvision_available() else None
+        return (
+            ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224") if is_torchvision_available() else None
+        )
 
     @slow
     def test_inference_image_classification_head(self):
-        # TODO: replace namespace to google
         model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224").to(torch_device)
 
         feature_extractor = self.default_feature_extractor
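
For context, the snippet below is a minimal sketch of the inference flow this integration test exercises: preprocess an image with the feature extractor, run the classification head, and map the top logit back to a label through the id2label mapping built in the conversion script above. It assumes the ViTFeatureExtractor / ViTForImageClassification API introduced in this patch series and the google/vit-base-patch16-224 checkpoint used in the test; the image URL is only an illustrative placeholder, not part of the patch.

    import requests
    import torch
    from PIL import Image

    from transformers import ViTFeatureExtractor, ViTForImageClassification

    # Load an example RGB image (any image works; this URL is illustrative only).
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    image = Image.open(requests.get(url, stream=True).raw)

    # Preprocess with the feature extractor and load the classification model.
    feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
    model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224")

    inputs = feature_extractor(images=image, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits  # shape (batch_size, 1000) for the ImageNet-1k head

    # Map the top logit to a human-readable label via the id2label mapping
    # attached to the model config (the same 1000-class dictionary shown above).
    predicted_idx = logits.argmax(-1).item()
    print(model.config.id2label[predicted_idx])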