diff --git a/.github/workflows/build_doc_test.yml b/.github/workflows/build_doc_test.yml deleted file mode 100644 index 06e74278315724..00000000000000 --- a/.github/workflows/build_doc_test.yml +++ /dev/null @@ -1,50 +0,0 @@ -name: Documentation test build - -on: - pull_request: - paths: - - "src/**" - - "docs/**" - - ".github/**" - -jobs: - build_and_package: - runs-on: ubuntu-latest - defaults: - run: - shell: bash -l {0} - - steps: - - uses: actions/checkout@v2 - - - name: Loading cache. - uses: actions/cache@v2 - id: cache - with: - path: ~/.cache/pip - key: v1-test_build_doc - restore-keys: | - v1-test_build_doc-${{ hashFiles('setup.py') }} - v1-test_build_doc - - - name: Setup environment - run: | - pip install --upgrade pip - sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev - - pip install git+https://github.com/huggingface/doc-builder - pip install .[dev] - - export TORCH_VERSION=$(python -c "from torch import version; print(version.__version__.split('+')[0])") - pip install torch-scatter -f https://data.pyg.org/whl/torch-${TORCH_VERSION}+cpu.html - - pip install torchvision - python -m pip install 'git+https://github.com/facebookresearch/detectron2.git' - - sudo apt install tesseract-ocr - pip install pytesseract - pip install pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com - - - name: Make documentation - run: | - doc-builder build transformers ./docs/source diff --git a/README.md b/README.md index bf2169cc600f3a..c14ba92a25ba13 100644 --- a/README.md +++ b/README.md @@ -280,7 +280,7 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal. 1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin. 1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team. -1. **[MaskFormer](https://huggingface.co/docs/transformers/master/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov +1. **[MaskFormer](https://huggingface.co/docs/transformers/master/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov. 1. 
**[MBart](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. 1. **[MBart-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. 1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. @@ -291,6 +291,7 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen. +1. **[PoolFormer](https://huggingface.co/docs/transformers/master/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. 1. **[REALM](https://huggingface.co/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. 
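PoolFormer is the model this changeset adds across the READMEs and docs. As a quick illustration of the feature-extractor workflow described in the new model page, here is a minimal inference sketch — the `sail/poolformer_s12` checkpoint name and the sample image are assumptions, while `PoolFormerFeatureExtractor` and `PoolFormerForImageClassification` are the classes the new doc exposes:

```python
import requests
import torch
from PIL import Image

from transformers import PoolFormerFeatureExtractor, PoolFormerForImageClassification

# Sample image (the COCO cats photo used throughout the docs)
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Checkpoint name is an assumption; see https://huggingface.co/models?other=poolformer
feature_extractor = PoolFormerFeatureExtractor.from_pretrained("sail/poolformer_s12")
model = PoolFormerForImageClassification.from_pretrained("sail/poolformer_s12")

inputs = feature_extractor(images=image, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# The s12 variant is fine-tuned on ImageNet-1k, so the head predicts one of 1000 classes
predicted_label = logits.argmax(-1).item()
print(model.config.id2label[predicted_label])
```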
diff --git a/README_ko.md b/README_ko.md index 01e4cdc64b9934..59cd805c40a832 100644 --- a/README_ko.md +++ b/README_ko.md @@ -258,7 +258,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal. 1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin. 1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team. -1. **[MaskFormer](https://huggingface.co/docs/transformers/master/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov +1. **[MaskFormer](https://huggingface.co/docs/transformers/master/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov. 1. **[MBart](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. 1. **[MBart-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. 1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. @@ -270,6 +270,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. 
**[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen. +1. **[PoolFormer](https://huggingface.co/docs/transformers/master/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. 1. **[REALM](https://huggingface.co/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. diff --git a/README_zh-hans.md b/README_zh-hans.md index 8474ea1ae29ff2..9d640adab63d97 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -294,6 +294,7 @@ conda install -c huggingface transformers 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (来自 Google) 伴随论文 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 由 Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 发布。 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (来自 Deepmind) 伴随论文 [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) 由 Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira 发布。 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (来自 VinAI Research) 伴随论文 [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) 由 Dat Quoc Nguyen and Anh Tuan Nguyen 发布。 +1. 
**[PoolFormer](https://huggingface.co/docs/transformers/master/model_doc/poolformer)** (来自 Sea AI Labs) 伴随论文 [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) 由 Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng 发布。 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (来自 Microsoft Research) 伴随论文 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 由 Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 发布。 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (来自 NVIDIA) 伴随论文 [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) 由 Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius 发布。 1. **[REALM](https://huggingface.co/transformers/model_doc/realm.html)** (来自 Google Research) 伴随论文 [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) 由 Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 35158983287b9e..22d3a98e2da192 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -306,6 +306,7 @@ conda install -c huggingface transformers 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen. +1. **[PoolFormer](https://huggingface.co/docs/transformers/master/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. 1. 
**[REALM](https://huggingface.co/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. diff --git a/docs/README.md b/docs/README.md index 62b07e07690f7c..1d06e83de64e43 100644 --- a/docs/README.md +++ b/docs/README.md @@ -39,8 +39,8 @@ check how they look like before committing for instance). You don't have to comm ## Building the documentation -Once you have setup the `doc-builder` and additional packages, you can generate the documentation by typing th -following command: +Once you have setup the `doc-builder` and additional packages, you can generate the documentation by +typing the following command: ```bash doc-builder build transformers docs/source/ --build_dir ~/tmp/test-build @@ -283,3 +283,71 @@ We have an automatic script running with the `make style` comment that will make This script may have some weird failures if you made a syntax mistake or if you uncover a bug. Therefore, it's recommended to commit your changes before running `make style`, so you can revert the changes done by that script easily. + +# Testing documentation examples + +Good documentation oftens comes with an example of how a specific function or class should be used. +Each model class should contain at least one example showcasing +how to use this model class in inference. *E.g.* the class [Wav2Vec2ForCTC](https://huggingface.co/docs/transformers/model_doc/wav2vec2#transformers.Wav2Vec2ForCTC) +includes an example of how to transcribe speech to text in the +[docstring of its forward function](https://huggingface.co/docs/transformers/model_doc/wav2vec2#transformers.Wav2Vec2ForCTC.forward). + +## Writing documenation examples + +The syntax for Example docstrings can look as follows: + +``` + Example: + + ```python + >>> from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC + >>> from datasets import load_dataset + >>> import torch + + >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") + >>> dataset = dataset.sort("id") + >>> sampling_rate = dataset.features["audio"].sampling_rate + + >>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h") + >>> model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h") + + >>> # audio file is decoded on the fly + >>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt") + >>> with torch.no_grad(): + ... logits = model(**inputs).logits + >>> predicted_ids = torch.argmax(logits, dim=-1) + + >>> # transcribe speech + >>> transcription = processor.batch_decode(predicted_ids) + >>> transcription[0] + 'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL' + ``` +``` + +The docstring should give a minimal, clear example of how the respective model +is to be used in inference and also include the expected (ideally sensible) +output. +Often, readers will try out the example before even going through the function +or class definitions. Therefore it is of utmost importance that the example +works as expected. + +## Docstring testing + +To do so each example should be included in the doctests. +We use pytests' [doctest integration](https://docs.pytest.org/doctest.html) to verify that all of our examples run correctly. 
+For Transformers, the doctests are run on a daily basis via GitHub Actions as can be +seen [here](https://github.com/huggingface/transformers/actions/workflows/doctests.yml). + +To include your example in the daily doctests, you need add the filename that +contains the example docstring to the [documentation_tests.txt](../utils/documentation_tests.txt). +You can test the example locally as follows: + +- For Python files ending with *.py*: +``` +pytest --doctest-modules src/transformers/models/wav2vec2/modeling_wav2vec2.py::transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC.forward -sv --doctest-continue-on-failure +``` + +- For Markdown files ending with *.mdx*: +``` +pytest --doctest-modules docs/source/quicktour.mdx -sv --doctest-continue-on-failure --doctest-glob="*.mdx" +``` diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index dc3f379169729e..3c17a28d4f752a 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -248,6 +248,8 @@ title: Pegasus - local: model_doc/phobert title: PhoBERT + - local: model_doc/poolformer + title: PoolFormer - local: model_doc/prophetnet title: ProphetNet - local: model_doc/qdqbert diff --git a/docs/source/debugging.mdx b/docs/source/debugging.mdx index edb3a6ece90592..daf45784369c4e 100644 --- a/docs/source/debugging.mdx +++ b/docs/source/debugging.mdx @@ -12,6 +12,35 @@ specific language governing permissions and limitations under the License. # Debugging +## Multi-GPU Network Issues Debug + +When training or inferencing with `DistributedDataParallel` and multiple GPU, if you run into issue of inter-communication between processes and/or nodes, you can use the following script to diagnose network issues. + +```bash +wget https://raw.githubusercontent.com/huggingface/transformers/master/scripts/distributed/torch-distributed-gpu-test.py +``` + +For example to test how 2 GPUs interact do: + +```bash +python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py +``` +If both processes can talk to each and allocate GPU memory each will print an OK status. + +For more GPUs or nodes adjust the arguments in the script. + +You will find a lot more details inside the diagnostics script and even a recipe to how you could run it in a SLURM environment. + +An additional level of debug is to add `NCCL_DEBUG=INFO` environment variable as follows: + +```bash +NCCL_DEBUG=INFO python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py +``` + +This will dump a lot of NCCL-related debug information, which you can then search online if you find that some problems are reported. Or if you're not sure how to interpret the output you can share the log file in an Issue. + + + ## Underflow and Overflow Detection diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 39d67e4909b030..e2105d15396ead 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -104,7 +104,7 @@ conversion utilities for the following models. 1. **[LXMERT](model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal. 1. 
**[M2M100](model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin. 1. **[MarianMT](model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team. -1. **[MaskFormer](model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov +1. **[MaskFormer](model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov. 1. **[MBart](model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. 1. **[MBart-50](model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. 1. **[Megatron-BERT](model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. @@ -115,6 +115,7 @@ conversion utilities for the following models. 1. **[Pegasus](model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[Perceiver IO](model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. 1. **[PhoBERT](model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen. +1. **[PoolFormer](model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng. 1. 
**[ProphetNet](model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. 1. **[QDQBert](model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. 1. **[REALM](https://huggingface.co/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. @@ -205,7 +206,7 @@ Flax), PyTorch, and/or TensorFlow. | LXMERT | ✅ | ✅ | ✅ | ✅ | ❌ | | M2M100 | ✅ | ❌ | ✅ | ❌ | ❌ | | Marian | ✅ | ❌ | ✅ | ✅ | ✅ | -| MaskFormer | ✅ | ❌ | ✅ | ❌ | ❌ | +| MaskFormer | ❌ | ❌ | ✅ | ❌ | ❌ | | mBART | ✅ | ✅ | ✅ | ✅ | ✅ | | MegatronBert | ❌ | ❌ | ✅ | ❌ | ❌ | | MobileBERT | ✅ | ✅ | ✅ | ✅ | ❌ | @@ -216,6 +217,7 @@ Flax), PyTorch, and/or TensorFlow. | OpenAI GPT-2 | ✅ | ✅ | ✅ | ✅ | ✅ | | Pegasus | ✅ | ✅ | ✅ | ✅ | ✅ | | Perceiver | ✅ | ❌ | ✅ | ❌ | ❌ | +| PoolFormer | ❌ | ❌ | ✅ | ❌ | ❌ | | ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | | QDQBert | ❌ | ❌ | ✅ | ❌ | ❌ | | RAG | ✅ | ❌ | ✅ | ✅ | ❌ | diff --git a/docs/source/internal/generation_utils.mdx b/docs/source/internal/generation_utils.mdx index 9eb4abe06d346f..5ee321c0a44248 100644 --- a/docs/source/internal/generation_utils.mdx +++ b/docs/source/internal/generation_utils.mdx @@ -148,6 +148,24 @@ generation. [[autodoc]] InfNanRemoveLogitsProcessor - __call__ +[[autodoc]] TFLogitsProcessor + - __call__ + +[[autodoc]] TFLogitsProcessorList + - __call__ + +[[autodoc]] TFMinLengthLogitsProcessor + - __call__ + +[[autodoc]] TFNoBadWordsLogitsProcessor + - __call__ + +[[autodoc]] TFNoRepeatNGramLogitsProcessor + - __call__ + +[[autodoc]] TFRepetitionPenaltyLogitsProcessor + - __call__ + [[autodoc]] FlaxLogitsProcessor - __call__ diff --git a/docs/source/main_classes/deepspeed.mdx b/docs/source/main_classes/deepspeed.mdx index c06c636301ddd7..3646b810aa2108 100644 --- a/docs/source/main_classes/deepspeed.mdx +++ b/docs/source/main_classes/deepspeed.mdx @@ -613,6 +613,17 @@ The following is an example of configuration for ZeRO stage 2: the slower the communication gets, and the more GPU RAM will be available to other tasks. So if a bigger batch size is important, getting a slightly slower training time could be a good trade. +Additionally, `deepspeed==0.4.4` added a new option `round_robin_gradients` which you can enable with: + +```json +{ + "zero_optimization": { + "round_robin_gradients": true + } +} +``` + +This is a stage 2 optimization for CPU offloading that parallelizes gradient copying to CPU memory among ranks by fine-grained gradient partitioning. Performance benefit grows with gradient accumulation steps (more copying between optimizer steps) or GPU count (increased parallelism). 
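Since `round_robin_gradients` only helps when gradients are actually being copied to CPU, it is usually enabled alongside the rest of a ZeRO stage 2 configuration with optimizer offload. A fuller illustrative stage 2 block might look as follows (bucket sizes and the other flags mirror the stage 2 example referenced above; adjust them to your setup):

```json
{
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "allgather_partitions": true,
        "allgather_bucket_size": 2e8,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": 2e8,
        "contiguous_gradients": true,
        "round_robin_gradients": true
    }
}
```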
@@ -1733,7 +1744,7 @@ Things to consider: ### Troubleshooting -- `deepspeed` process gets killed at startup without a traceback +#### the `deepspeed` process gets killed at startup without a traceback If the `deepspeed` process gets killed at launch time without a traceback, that usually means that the program tried to allocate more CPU memory than your system has or your process is allowed to allocate and the OS kernel killed that @@ -1742,7 +1753,49 @@ both configured to offload to `cpu`. If you have NVMe, experiment with offloadin ZeRO-3. Here is how you can [estimate how much memory is needed for a specific model](https://deepspeed.readthedocs.io/en/latest/memory.html). +#### training and/or eval/predict loss is `NaN` + +This often happens when one takes a model pre-trained in bf16 mixed precision mode and tries to use it under fp16 (with or without mixed precision). Most models trained on TPU and often the ones released by Google are in this category (e.g. almost all t5-based models). Here the solution is to either use fp32 or bf16 if your hardware supports it (TPU, Ampere GPUs or newer). + +The other problem may have to do with using fp16. When you configure this section: + +```json +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + } +} +``` + +and you see in your log that Deepspeed reports `OVERFLOW!` as follows: + +``` +0%| | 0/189 [00:00(https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov +The MaskFormer model was proposed in [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov. MaskFormer addresses semantic segmentation with a mask classification paradigm instead of performing classic pixel-level classification. The abstract from the paper is the following: -_Modern approaches typically formulate semantic segmentation as a per-pixel classification task, while instance-level segmentation is handled with an alternative mask classification. Our key insight: mask classification is sufficiently general to solve both semantic- and instance-level segmentation tasks in a unified manner using the exact same model, loss, and training procedure. Following this observation, we propose MaskFormer, a simple mask classification model which predicts a set of binary masks, each associated with a single global class label prediction. Overall, the proposed mask classification-based method simplifies the landscape of effective approaches to semantic and panoptic segmentation tasks and shows excellent empirical results. In particular, we observe that MaskFormer outperforms per-pixel classification baselines when the number of classes is large. Our mask classification-based method outperforms both current state-of-the-art semantic (55.6 mIoU on ADE20K) and panoptic segmentation (52.7 PQ on COCO) models._ +*Modern approaches typically formulate semantic segmentation as a per-pixel classification task, while instance-level segmentation is handled with an alternative mask classification. Our key insight: mask classification is sufficiently general to solve both semantic- and instance-level segmentation tasks in a unified manner using the exact same model, loss, and training procedure. 
Following this observation, we propose MaskFormer, a simple mask classification model which predicts a set of binary masks, each associated with a single global class label prediction. Overall, the proposed mask classification-based method simplifies the landscape of effective approaches to semantic and panoptic segmentation tasks and shows excellent empirical results. In particular, we observe that MaskFormer outperforms per-pixel classification baselines when the number of classes is large. Our mask classification-based method outperforms both current state-of-the-art semantic (55.6 mIoU on ADE20K) and panoptic segmentation (52.7 PQ on COCO) models.* Tips: - -- One can use the [`AutoFeatureExtractor`] API to prepare images for the model. - -This model was contributed by [francesco]( + +# PoolFormer + +## Overview + +The PoolFormer model was proposed in [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Sea AI Labs. Instead of designing complicated token mixer to achieve SOTA performance, the target of this work is to demonstrate the competence of transformer models largely stem from the general architecture MetaFormer. + +The abstract from the paper is the following: + +*Transformers have shown great potential in computer vision tasks. A common belief is their attention-based token mixer module contributes most to their competence. However, recent works show the attention-based module in transformers can be replaced by spatial MLPs and the resulted models still perform quite well. Based on this observation, we hypothesize that the general architecture of the transformers, instead of the specific token mixer module, is more essential to the model's performance. To verify this, we deliberately replace the attention module in transformers with an embarrassingly simple spatial pooling operator to conduct only the most basic token mixing. Surprisingly, we observe that the derived model, termed as PoolFormer, achieves competitive performance on multiple computer vision tasks. For example, on ImageNet-1K, PoolFormer achieves 82.1% top-1 accuracy, surpassing well-tuned vision transformer/MLP-like baselines DeiT-B/ResMLP-B24 by 0.3%/1.1% accuracy with 35%/52% fewer parameters and 48%/60% fewer MACs. The effectiveness of PoolFormer verifies our hypothesis and urges us to initiate the concept of "MetaFormer", a general architecture abstracted from transformers without specifying the token mixer. Based on the extensive experiments, we argue that MetaFormer is the key player in achieving superior results for recent transformer and MLP-like models on vision tasks. This work calls for more future research dedicated to improving MetaFormer instead of focusing on the token mixer modules. Additionally, our proposed PoolFormer could serve as a starting baseline for future MetaFormer architecture design.* + +The figure below illustrates the architecture of SegFormer. Taken from the [original paper](https://arxiv.org/abs/2111.11418). + + + + +Tips: + +- PoolFormer has a hierarchical architecture, where instead of Attention, a simple Average Pooling layer is present. All checkpoints of the model can be found on the [hub](https://huggingface.co/models?other=poolformer). +- One can use [`PoolFormerFeatureExtractor`] to prepare images for the model. +- As most models, PoolFormer comes in different sizes, the details of which can be found in the table below. 
+ +| **Model variant** | **Depths** | **Hidden sizes** | **Params (M)** | **ImageNet-1k Top 1** | +| :---------------: | ------------- | ------------------- | :------------: | :-------------------: | +| s12 | [2, 2, 6, 2] | [64, 128, 320, 512] | 12 | 77.2 | +| s24 | [4, 4, 12, 4] | [64, 128, 320, 512] | 21 | 80.3 | +| s36 | [6, 6, 18, 6] | [64, 128, 320, 512] | 31 | 81.4 | +| m36 | [6, 6, 18, 6] | [96, 192, 384, 768] | 56 | 82.1 | +| m48 | [8, 8, 24, 8] | [96, 192, 384, 768] | 73 | 82.5 | + +This model was contributed by [heytanay](https://huggingface.co/heytanay). The original code can be found [here](https://github.com/sail-sg/poolformer). + +## PoolFormerConfig + +[[autodoc]] PoolFormerConfig + +## PoolFormerFeatureExtractor + +[[autodoc]] PoolFormerFeatureExtractor + - __call__ + +## PoolFormerModel + +[[autodoc]] PoolFormerModel + - forward + +## PoolFormerForImageClassification + +[[autodoc]] PoolFormerForImageClassification + - forward \ No newline at end of file diff --git a/docs/source/model_doc/speech_to_text_2.mdx b/docs/source/model_doc/speech_to_text_2.mdx index 62509b67ec3203..72754b67aab9a2 100644 --- a/docs/source/model_doc/speech_to_text_2.mdx +++ b/docs/source/model_doc/speech_to_text_2.mdx @@ -69,7 +69,7 @@ predicted token ids. >>> ds = ds.map(map_to_array) >>> inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt") ->>> generated_ids = model.generate(input_ids=inputs["input_values"], attention_mask=inputs["attention_mask"]) +>>> generated_ids = model.generate(inputs=inputs["input_values"], attention_mask=inputs["attention_mask"]) >>> transcription = processor.batch_decode(generated_ids) ``` diff --git a/docs/source/model_doc/t5.mdx b/docs/source/model_doc/t5.mdx index 47bcdc662f0511..dbcfaf1c7dc7fe 100644 --- a/docs/source/model_doc/t5.mdx +++ b/docs/source/model_doc/t5.mdx @@ -263,6 +263,11 @@ print(tokenizer.batch_decode(output_sequences, skip_special_tokens=True)) +## Performance + +If you'd like a faster training and inference performance, install [apex](https://github.com/NVIDIA/apex#quick-start) and then the model will automatically use `apex.normalization.FusedRMSNorm` instead of `T5LayerNorm`. The former uses an optimized fused kernel which is several times faster than the latter. + + ## Example scripts T5 is supported by several example scripts, both for pre-training and fine-tuning. diff --git a/docs/source/model_doc/vision-encoder-decoder.mdx b/docs/source/model_doc/vision-encoder-decoder.mdx index d766075a7355b1..987924d4ad7c03 100644 --- a/docs/source/model_doc/vision-encoder-decoder.mdx +++ b/docs/source/model_doc/vision-encoder-decoder.mdx @@ -13,8 +13,8 @@ specific language governing permissions and limitations under the License. # Vision Encoder Decoder Models The [`VisionEncoderDecoderModel`] can be used to initialize an image-to-text-sequence model with any -pretrained vision autoencoding model as the encoder (*e.g.* [ViT](vit), [BEiT](beit), [DeiT](deit)) -and any pretrained language model as the decoder (*e.g.* [RoBERTa](roberta), [GPT2](gpt2), [BERT](bert)). +pretrained Transformer-based vision autoencoding model as the encoder (*e.g.* [ViT](vit), [BEiT](beit), [DeiT](deit), [Swin](swin)) +and any pretrained language model as the decoder (*e.g.* [RoBERTa](roberta), [GPT2](gpt2), [BERT](bert), [DistilBERT](distilbert)). 
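To make the encoder/decoder pairing above concrete, such a model can be assembled with `VisionEncoderDecoderModel.from_encoder_decoder_pretrained`. A minimal sketch — the two checkpoint names are only illustrative choices of a vision encoder and a text decoder:

```python
from transformers import VisionEncoderDecoderModel

# Pair any pretrained vision encoder with any pretrained language-model decoder;
# these checkpoints are illustrative, not a recommendation.
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    "google/vit-base-patch16-224-in21k", "bert-base-uncased"
)

# The decoder's cross-attention layers are randomly initialized, so the combined
# model still needs fine-tuning on an image-to-text dataset before use.
model.save_pretrained("./vit-bert")
```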
The effectiveness of initializing image-to-text-sequence models with pretrained checkpoints has been shown in (for example) [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, diff --git a/docs/source/quicktour.mdx b/docs/source/quicktour.mdx index 867170656ecdf7..1c4077c549995d 100644 --- a/docs/source/quicktour.mdx +++ b/docs/source/quicktour.mdx @@ -118,7 +118,7 @@ Next, load a dataset (see the 🤗 Datasets [Quick Start](https://huggingface.co Now you can iterate over the dataset with the pipeline. `KeyDataset` retrieves the item in the dictionary returned by the dataset: ```py ->>> from transformers.pipelines.base import KeyDataset +>>> from transformers.pipelines.pt_utils import KeyDataset >>> from tqdm.auto import tqdm >>> for out in tqdm(speech_recognizer(KeyDataset(dataset, "file"))): @@ -316,4 +316,4 @@ One particularly cool 🤗 Transformers feature is the ability to save a model a >>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory) >>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True) -``` \ No newline at end of file +``` diff --git a/docs/source/tokenizer_summary.mdx b/docs/source/tokenizer_summary.mdx index 401c620d008bb6..78278390302b0a 100644 --- a/docs/source/tokenizer_summary.mdx +++ b/docs/source/tokenizer_summary.mdx @@ -219,7 +219,7 @@ equivalent to finding the symbol pair, whose probability divided by the probabil its second symbol is the greatest among all symbol pairs. *E.g.* `"u"`, followed by `"g"` would have only been merged if the probability of `"ug"` divided by `"u"`, `"g"` would have been greater than for any other symbol pair. Intuitively, WordPiece is slightly different to BPE in that it evaluates what it _loses_ by merging two symbols -to make ensure it's _worth it_. +to ensure it's _worth it_. diff --git a/examples/pytorch/README.md b/examples/pytorch/README.md index 42532239083330..35c450a9414e9a 100644 --- a/examples/pytorch/README.md +++ b/examples/pytorch/README.md @@ -43,6 +43,7 @@ Coming soon! 
| [**`speech-recognition`**](https://github.com/huggingface/transformers/tree/master/examples/pytorch/speech-recognition) | TIMIT | ✅ | - |✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/speech_recognition.ipynb) | [**`multi-lingual speech-recognition`**](https://github.com/huggingface/transformers/tree/master/examples/pytorch/speech-recognition) | Common Voice | ✅ | - |✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/multi_lingual_speech_recognition.ipynb) | [**`audio-classification`**](https://github.com/huggingface/transformers/tree/master/examples/pytorch/audio-classification) | SUPERB KS | ✅ | - |✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/audio_classification.ipynb) +| [**`image-classification`**](https://github.com/huggingface/notebooks/blob/master/examples/image_classification.ipynb) | CIFAR-10 | ✅ | - |✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/image_classification.ipynb) ## Running quick tests diff --git a/examples/research_projects/pplm/README.md b/examples/research_projects/pplm/README.md index 204500879fc43b..f37ea8e96f216d 100644 --- a/examples/research_projects/pplm/README.md +++ b/examples/research_projects/pplm/README.md @@ -10,6 +10,9 @@ Blog link: https://eng.uber.com/pplm Please check out the repo under uber-research for more information: https://github.com/uber-research/PPLM +# Note + +⚠️ This project should be run with pytorch-lightning==1.0.4 which has a potential security vulnerability ## Setup @@ -20,7 +23,7 @@ pip install nltk torchtext # additional requirements. cd examples/research_projects/pplm ``` -## PPLM-BoW +## PPLM-BoW ### Example command for bag-of-words control @@ -30,7 +33,7 @@ python run_pplm.py -B military --cond_text "The potato" --length 50 --gamma 1.5 ### Tuning hyperparameters for bag-of-words control -1. Increase `--stepsize` to intensify topic control, and decrease its value to soften the control. `--stepsize 0` recovers the original uncontrolled GPT-2 model. +1. Increase `--stepsize` to intensify topic control, and decrease its value to soften the control. `--stepsize 0` recovers the original uncontrolled GPT-2 model. 2. If the language being generated is repetitive (For e.g. "science science experiment experiment"), there are several options to consider:
a) Reduce the `--stepsize`
@@ -48,7 +51,6 @@ python run_pplm.py -D sentiment --class_label 2 --cond_text "My dog died" --leng ### Tuning hyperparameters for discriminator control -1. Increase `--stepsize` to intensify topic control, and decrease its value to soften the control. `--stepsize 0` recovers the original uncontrolled GPT-2 model. +1. Increase `--stepsize` to intensify topic control, and decrease its value to soften the control. `--stepsize 0` recovers the original uncontrolled GPT-2 model. 2. Use `--class_label 3` for negative, and `--class_label 2` for positive - diff --git a/examples/research_projects/pplm/requirements.txt b/examples/research_projects/pplm/requirements.txt index 62092cc300ac44..70530cd79983a7 100644 --- a/examples/research_projects/pplm/requirements.txt +++ b/examples/research_projects/pplm/requirements.txt @@ -5,7 +5,7 @@ psutil sacrebleu rouge-score tensorflow_datasets -pytorch-lightning==1.0.4 +pytorch-lightning matplotlib git-python==1.0.3 faiss-cpu diff --git a/examples/research_projects/rag-end2end-retriever/README.md b/examples/research_projects/rag-end2end-retriever/README.md index 7f6ef0bd6591da..dcb615918c2fcd 100644 --- a/examples/research_projects/rag-end2end-retriever/README.md +++ b/examples/research_projects/rag-end2end-retriever/README.md @@ -2,29 +2,32 @@ This finetuning script is actively maintained by [Shamane Siri](https://github.com/shamanez). Feel free to ask questions on the [Forum](https://discuss.huggingface.co/) or post an issue on [GitHub](https://github.com/huggingface/transformers/issues/new/choose) and tag @shamanez. -Others that helped out: Patrick von Platen (@patrickvonplaten), Quentin Lhoest (@lhoestq), and Rivindu Weerasekera (@rivinduw) +Others that helped out: Patrick von Platen (@patrickvonplaten), Quentin Lhoest (@lhoestq), and Rivindu Weerasekera (@rivinduw) -The original RAG implementation is able to train the question encoder and generator end-to-end. -This extension enables complete end-to-end training of RAG including the context encoder in the retriever component. +The original RAG implementation is able to train the question encoder and generator end-to-end. +This extension enables complete end-to-end training of RAG including the context encoder in the retriever component. Please read the [accompanying blog post](https://shamanesiri.medium.com/how-to-finetune-the-entire-rag-architecture-including-dpr-retriever-4b4385322552) for details on this implementation. The original RAG code has also been modified to work with the latest versions of pytorch lightning (version 1.2.10) and RAY (version 1.3.0). All other implementation details remain the same as the [original RAG code](https://github.com/huggingface/transformers/tree/master/examples/research_projects/rag). Read more about RAG at https://arxiv.org/abs/2005.11401. -This code can be modified to experiment with other research on retrival augmented models which include training of the retriever (e.g. [REALM](https://arxiv.org/abs/2002.08909) and [MARGE](https://arxiv.org/abs/2006.15020)). +This code can be modified to experiment with other research on retrival augmented models which include training of the retriever (e.g. [REALM](https://arxiv.org/abs/2002.08909) and [MARGE](https://arxiv.org/abs/2006.15020)). -To start training, use the bash script (finetune_rag_ray_end2end.sh) in this folder. This script also includes descriptions on each command-line argument used. +To start training, use the bash script (finetune_rag_ray_end2end.sh) in this folder. 
This script also includes descriptions on each command-line argument used. +# Note + +⚠️ This project should be run with pytorch-lightning==1.3.1 which has a potential security vulnerability # Testing The following two bash scripts can be used to quickly test the implementation. -1. sh ./test_run/test_rag_new_features.sh - - Tests the newly added functions (set_context_encoder and set_context_encoder_tokenizer) related to modeling rag. +1. sh ./test_run/test_rag_new_features.sh + - Tests the newly added functions (set_context_encoder and set_context_encoder_tokenizer) related to modeling rag. - This is sufficient to check the model's ability to use the set functions correctly. 2. sh ./test_run/test_finetune.sh script - Tests the full end-to-end fine-tuning ability with a dummy knowlendge-base and dummy training dataset (check test_dir directory). - - Users can replace the dummy dataset and knowledge-base with their own to do their own finetuning. + - Users can replace the dummy dataset and knowledge-base with their own to do their own finetuning. # Comparison of end2end RAG (including DPR finetuning) VS original-RAG @@ -34,14 +37,14 @@ We conducted a simple experiment to investigate the effectiveness of this end2en - Create a knowledge-base using all the context passages in the SQuAD dataset with their respective titles. - Use the question-answer pairs as training data. - Train the system for 10 epochs. -- Test the Exact Match (EM) score with the SQuAD dataset's validation set. -- Training dataset, the knowledge-base, and hyperparameters used in experiments can be accessed from [here](https://drive.google.com/drive/folders/1qyzV-PaEARWvaU_jjpnU_NUS3U_dSjtG?usp=sharing). +- Test the Exact Match (EM) score with the SQuAD dataset's validation set. +- Training dataset, the knowledge-base, and hyperparameters used in experiments can be accessed from [here](https://drive.google.com/drive/folders/1qyzV-PaEARWvaU_jjpnU_NUS3U_dSjtG?usp=sharing). -# Results +# Results -- We train both models for 10 epochs. +- We train both models for 10 epochs. | Model Type | EM-Score| -| --------------------| --------| +| --------------------| --------| | RAG-original | 28.12 | -| RAG-end2end with DPR| 40.02 | +| RAG-end2end with DPR| 40.02 | diff --git a/examples/research_projects/rag-end2end-retriever/requirements.txt b/examples/research_projects/rag-end2end-retriever/requirements.txt index 473d972761e312..aca89c78e88c0d 100644 --- a/examples/research_projects/rag-end2end-retriever/requirements.txt +++ b/examples/research_projects/rag-end2end-retriever/requirements.txt @@ -2,6 +2,6 @@ faiss-cpu >= 1.7.0 datasets >= 1.6.2 psutil >= 5.7.0 torch >= 1.4.0 -pytorch-lightning == 1.3.1 +pytorch-lightning nvidia-ml-py3 == 7.352.0 ray >= 1.3.0 diff --git a/examples/research_projects/rag/README.md b/examples/research_projects/rag/README.md index 74a1ab0bf93fa0..b7b17d731bb10e 100644 --- a/examples/research_projects/rag/README.md +++ b/examples/research_projects/rag/README.md @@ -11,6 +11,10 @@ Such contextualized inputs are passed to the generator. Read more about RAG at https://arxiv.org/abs/2005.11401. +# Note + +⚠️ This project should be run with pytorch-lightning==1.3.1 which has a potential security vulnerability + # Finetuning Our finetuning logic is based on scripts from [`examples/seq2seq`](https://github.com/huggingface/transformers/tree/master/examples/seq2seq). 
We accept training data in the same format as specified there - we expect a directory consisting of 6 text files: @@ -52,8 +56,8 @@ You will then be able to pass `path/to/checkpoint` as `model_name_or_path` to th ## Document Retrieval When running distributed fine-tuning, each training worker needs to retrieve contextual documents -for its input by querying a index loaded into memory. RAG provides two implementations for document retrieval, -one with [`torch.distributed`](https://pytorch.org/docs/stable/distributed.html) communication package and the other +for its input by querying a index loaded into memory. RAG provides two implementations for document retrieval, +one with [`torch.distributed`](https://pytorch.org/docs/stable/distributed.html) communication package and the other with [`Ray`](https://docs.ray.io/en/master/). This option can be configured with the `--distributed_retriever` flag which can either be set to `pytorch` or `ray`. @@ -62,7 +66,7 @@ By default this flag is set to `pytorch`. For the Pytorch implementation, only training worker 0 loads the index into CPU memory, and a gather/scatter pattern is used to collect the inputs from the other training workers and send back the corresponding document embeddings. -For the Ray implementation, the index is loaded in *separate* process(es). The training workers randomly select which +For the Ray implementation, the index is loaded in *separate* process(es). The training workers randomly select which retriever worker to query. To use Ray for distributed retrieval, you have to set the `--distributed_retriever` arg to `ray`. To configure the number of retrieval workers (the number of processes that load the index), you can set the `num_retrieval_workers` flag. Also make sure to start the Ray cluster before running fine-tuning. @@ -119,7 +123,7 @@ We demonstrate how to evaluate retrieval against DPR evaluation data. You can do --gold_data_path output/biencoder-nq-dev.pages ``` 3. Run evaluation: - ```bash + ```bash python examples/research_projects/rag/eval_rag.py \ --model_name_or_path facebook/rag-sequence-nq \ --model_type rag_sequence \ @@ -139,7 +143,7 @@ We demonstrate how to evaluate retrieval against DPR evaluation data. You can do --predictions_path output/retrieval_preds.tsv \ # name of file where predictions will be stored --eval_mode retrieval \ # indicates whether we're performing retrieval evaluation or e2e evaluation --k 1 # parameter k for the precision@k metric - + ``` ## End-to-end evaluation @@ -153,8 +157,8 @@ who is the owner of reading football club ['Xiu Li Dai', 'Dai Yongge', 'Dai Xiul Xiu Li Dai ``` -Predictions of the model for the samples from the `evaluation_set` will be saved under the path specified by the `predictions_path` parameter. -If this path already exists, the script will use saved predictions to calculate metrics. +Predictions of the model for the samples from the `evaluation_set` will be saved under the path specified by the `predictions_path` parameter. +If this path already exists, the script will use saved predictions to calculate metrics. Add `--recalculate` parameter to force the script to perform inference from scratch. 
An example e2e evaluation run could look as follows: @@ -196,4 +200,4 @@ python examples/research_projects/rag/finetune_rag.py \ --index_name custom --passages_path path/to/data/my_knowledge_dataset --index_path path/to/my_knowledge_dataset_hnsw_index.faiss -``` \ No newline at end of file +``` diff --git a/examples/research_projects/rag/callbacks_rag.py b/examples/research_projects/rag/callbacks_rag.py index e9eda20de300fc..a2d87f82247c4a 100644 --- a/examples/research_projects/rag/callbacks_rag.py +++ b/examples/research_projects/rag/callbacks_rag.py @@ -38,7 +38,7 @@ def get_checkpoint_callback(output_dir, metric): monitor=f"val_{metric}", mode="max", save_top_k=3, - period=1, # maybe save a checkpoint every time val is run, not just end of epoch. + every_n_epochs=1, # maybe save a checkpoint every time val is run, not just end of epoch. ) return checkpoint_callback diff --git a/examples/research_projects/rag/finetune_rag.py b/examples/research_projects/rag/finetune_rag.py index a1721623dd60cc..2fd4ef7659c543 100644 --- a/examples/research_projects/rag/finetune_rag.py +++ b/examples/research_projects/rag/finetune_rag.py @@ -254,7 +254,7 @@ def pad(self) -> int: def training_step(self, batch, batch_idx) -> Dict: loss_tensors = self._step(batch) - logs = {name: loss for name, loss in zip(self.loss_names, loss_tensors)} + logs = {name: loss.detach() for name, loss in zip(self.loss_names, loss_tensors)} # tokens per batch tgt_pad_token_id = ( self.tokenizer.generator.pad_token_id @@ -517,7 +517,7 @@ def main(args=None, model=None) -> GenerativeQAModule: raise RuntimeError("Please install Ray to use the Ray " "distributed retriever.") # Connect to an existing Ray cluster. try: - ray.init(address=args.ray_address) + ray.init(address=args.ray_address, namespace="rag") except (ConnectionError, ValueError): logger.warning( "Connection to Ray cluster failed. Make sure a Ray" diff --git a/examples/research_projects/rag/lightning_base.py b/examples/research_projects/rag/lightning_base.py index 0d93626677cc48..1e0f67627e7c34 100644 --- a/examples/research_projects/rag/lightning_base.py +++ b/examples/research_projects/rag/lightning_base.py @@ -266,6 +266,15 @@ def add_model_specific_args(parser, root_dir): parser.add_argument("--adafactor", action="store_true") +class InitCallback(pl.Callback): + # This method is better that using a custom DDP plugging with the latest pytorch-lightning (@shamanez) + def on_sanity_check_start(self, trainer, pl_module): + if ( + trainer.is_global_zero and trainer.global_rank == 0 + ): # we initialize the retriever only on master worker with RAY. In new pytorch-lightning accelorators are removed. + pl_module.model.rag.retriever.init_retrieval() # better to use hook functions. 
+ + class LoggingCallback(pl.Callback): def on_batch_end(self, trainer, pl_module): lr_scheduler = trainer.lr_schedulers[0]["scheduler"] @@ -368,19 +377,21 @@ def generic_train( # TODO: remove with PyTorch 1.6 since pl uses native amp if args.fp16: train_params["precision"] = 16 - train_params["amp_level"] = args.fp16_opt_level + # train_params["amp_level"] = args.fp16_opt_level if args.gpus > 1: - train_params["accelerator"] = "ddp" + train_params["accelerator"] = "auto" # "ddp" + train_params["strategy"] = "ddp" train_params["accumulate_grad_batches"] = args.accumulate_grad_batches train_params["profiler"] = None # extra_train_kwargs.get("profiler", None) #get unwanted logs + train_params["devices"] = "auto" trainer = pl.Trainer.from_argparse_args( args, weights_summary=None, - callbacks=[logging_callback] + extra_callbacks + [checkpoint_callback], - plugins=[custom_ddp_plugin], + callbacks=[logging_callback] + extra_callbacks + [checkpoint_callback] + [InitCallback()], + # plugins=[custom_ddp_plugin], logger=logger, **train_params, ) diff --git a/examples/research_projects/rag/requirements.txt b/examples/research_projects/rag/requirements.txt index ef065e36e1c983..fdeb5567d24d55 100644 --- a/examples/research_projects/rag/requirements.txt +++ b/examples/research_projects/rag/requirements.txt @@ -2,6 +2,7 @@ faiss-cpu >= 1.6.3 datasets >= 1.0.1 psutil >= 5.7.0 torch >= 1.4.0 +ray >= 1.10.0 +pytorch-lightning >= 1.5.10 transformers -pytorch-lightning==1.3.1 GitPython \ No newline at end of file diff --git a/examples/research_projects/seq2seq-distillation/README.md b/examples/research_projects/seq2seq-distillation/README.md index 8157f753f8ecb7..62c38bfd71402f 100644 --- a/examples/research_projects/seq2seq-distillation/README.md +++ b/examples/research_projects/seq2seq-distillation/README.md @@ -13,6 +13,10 @@ Author: Sam Shleifer (https://github.com/sshleifer) - `FSMTForConditionalGeneration` - `T5ForConditionalGeneration` +# Note + +⚠️ This project should be run with pytorch-lightning==1.0.4 which has a potential security vulnerability + ## Datasets #### XSUM @@ -62,7 +66,7 @@ https://github.com/huggingface/transformers/tree/master/scripts/fsmt #### Pegasus (multiple datasets) -Multiple eval datasets are available for download from: +Multiple eval datasets are available for download from: https://github.com/stas00/porting/tree/master/datasets/pegasus @@ -210,7 +214,7 @@ model = AutoModelForSeq2SeqLM.from_pretrained(f'{output_dir}/best_tfmr') ### Converting pytorch-lightning checkpoints pytorch lightning ``-do_predict`` often fails, after you are done training, the best way to evaluate your model is to convert it. -This should be done for you, with a file called `{save_dir}/best_tfmr`. +This should be done for you, with a file called `{save_dir}/best_tfmr`. If that file doesn't exist but you have a lightning `.ckpt` file, you can run ```bash @@ -219,7 +223,7 @@ python convert_pl_checkpoint_to_hf.py PATH_TO_CKPT randomly_initialized_hf_mode Then either `run_eval` or `run_distributed_eval` with `save_dir/best_tfmr` (see previous sections) -# Experimental Features +# Experimental Features These features are harder to use and not always useful. 
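Since several hunks above move the RAG example from the old pinned pytorch-lightning release to the 1.5+ API (`strategy`, `devices`, `every_n_epochs`), here is a minimal sketch of that newer Trainer/checkpoint interface, assuming pytorch-lightning >= 1.5. The monitored metric name and directory below are illustrative placeholders, not values taken from the scripts.

```python
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

# Checkpointing: `every_n_epochs` replaces the deprecated `period` argument used before.
checkpoint_callback = ModelCheckpoint(
    dirpath="output_dir/checkpoints",  # placeholder path
    monitor="val_em",                  # placeholder metric name
    mode="max",
    save_top_k=3,
    every_n_epochs=1,
)

# Trainer: the distribution strategy is its own argument now, instead of accelerator="ddp".
trainer = pl.Trainer(
    accelerator="auto",
    strategy="ddp",
    devices="auto",
    callbacks=[checkpoint_callback],
)
```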
### Dynamic Batch Size for MT @@ -230,7 +234,7 @@ This feature can only be used: - without sortish sampler - after calling `./save_len_file.py $tok $data_dir` -For example, +For example, ```bash ./save_len_file.py Helsinki-NLP/opus-mt-en-ro wmt_en_ro ./dynamic_bs_example.sh --max_tokens_per_batch=2000 --output_dir benchmark_dynamic_bs @@ -254,10 +258,10 @@ This section describes all code and artifacts from our [Paper](http://arxiv.org/ ![DBART](https://huggingface.co/front/thumbnails/distilbart_large.png) + For the CNN/DailyMail dataset, (relatively longer, more extractive summaries), we found a simple technique that works, which we call "Shrink and Fine-tune", or SFT. -you just copy alternating layers from `facebook/bart-large-cnn` and fine-tune more on the cnn/dm data. `sshleifer/distill-pegasus-cnn-16-4`, `sshleifer/distilbart-cnn-12-6` and all other checkpoints under `sshleifer` that start with `distilbart-cnn` were trained this way. +you just copy alternating layers from `facebook/bart-large-cnn` and fine-tune more on the cnn/dm data. `sshleifer/distill-pegasus-cnn-16-4`, `sshleifer/distilbart-cnn-12-6` and all other checkpoints under `sshleifer` that start with `distilbart-cnn` were trained this way. + For the XSUM dataset, training on pseudo-labels worked best for Pegasus (`sshleifer/distill-pegasus-16-4`), while training with KD worked best for `distilbart-xsum-12-6` + For `sshleifer/dbart-xsum-12-3` -+ We ran 100s experiments, and didn't want to document 100s of commands. If you want a command to replicate a figure from the paper that is not documented below, feel free to ask on the [forums](https://discuss.huggingface.co/t/seq2seq-distillation-methodology-questions/1270) and tag `@sshleifer`. ++ We ran 100s experiments, and didn't want to document 100s of commands. If you want a command to replicate a figure from the paper that is not documented below, feel free to ask on the [forums](https://discuss.huggingface.co/t/seq2seq-distillation-methodology-questions/1270) and tag `@sshleifer`. + You can see the performance tradeoffs of model sizes [here](https://docs.google.com/spreadsheets/d/1EkhDMwVO02m8jCD1cG3RoFPLicpcL1GQHTQjfvDYgIM/edit#gid=0). and more granular timing results [here](https://docs.google.com/spreadsheets/d/1EkhDMwVO02m8jCD1cG3RoFPLicpcL1GQHTQjfvDYgIM/edit#gid=1753259047&range=B2:I23). @@ -303,10 +307,10 @@ deval 1 sshleifer/distill-pegasus-xsum-16-4 xsum dpx_xsum_eval + Find a teacher model [Pegasus](https://huggingface.co/models?search=pegasus) (slower, better ROUGE) or `facebook/bart-large-xsum`/`facebook/bart-large-cnn` (faster, slightly lower.). Choose the checkpoint where the corresponding dataset is most similar (or identical to) your dataset. + Follow the sections in order below. You can stop after SFT if you are satisfied, or move on to pseudo-labeling if you want more performance. -+ student size: If you want a close to free 50% speedup, cut the decoder in half. If you want a larger speedup, cut it in 4. ++ student size: If you want a close to free 50% speedup, cut the decoder in half. If you want a larger speedup, cut it in 4. + If your SFT run starts at a validation ROUGE-2 that is more than 10 pts below the teacher's validation ROUGE-2, you have a bug. Switching to a more expensive technique will not help. Try setting a breakpoint and looking at generation and truncation defaults/hyper-parameters, and share your experience on the forums! 
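Before the `make_student.py` usage shown in the Initialization section below, here is an illustrative, hand-rolled sketch of the "Shrink and Fine-tune" idea described above: copy every other decoder layer of the 12-layer teacher into a 6-layer student and fine-tune from there. The output path is a placeholder, and the project's `make_student.py` remains the supported way to do this.

```python
from transformers import AutoConfig, AutoModelForSeq2SeqLM

teacher = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
student_config = AutoConfig.from_pretrained("facebook/bart-large-cnn", decoder_layers=6)
student = AutoModelForSeq2SeqLM.from_config(student_config)

# 1) Start from every teacher weight whose name and shape match (encoder, embeddings, ...).
teacher_state = teacher.state_dict()
student_state = student.state_dict()
matching = {
    k: v for k, v in teacher_state.items() if k in student_state and v.shape == student_state[k].shape
}
student.load_state_dict(matching, strict=False)

# 2) Overwrite the student's decoder layers with alternating teacher layers (0, 2, 4, ...).
for student_idx, teacher_idx in enumerate(range(0, teacher.config.decoder_layers, 2)):
    student.model.decoder.layers[student_idx].load_state_dict(
        teacher.model.decoder.layers[teacher_idx].state_dict()
    )

student.save_pretrained("student_cnn_12_6")  # then fine-tune on CNN/DailyMail as usual
```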
- + #### Initialization We use [make_student.py](./make_student.py) to copy alternating layers from the teacher, and save the resulting model to disk ```bash @@ -319,7 +323,7 @@ python make_student.py google/pegasus-xsum --save_path dpx_xsum_16_4 --e 16 --d we now have an initialized student saved to `dbart_xsum_12_3`, which we will use for the following commands. + Extension: To replicate more complicated initialize experiments in section 6.1, or try your own. Use the `create_student_by_copying_alternating_layers` function. -#### Pegasus +#### Pegasus + The following commands are written for BART and will require, at minimum, the following modifications + reduce batch size, and increase gradient accumulation steps so that the product `gpus * batch size * gradient_accumulation_steps = 256`. We used `--learning-rate` = 1e-4 * gradient accumulation steps. + don't use fp16 @@ -379,7 +383,7 @@ python finetune.py \ --output_dir dbart_xsum_12_3_PL --gpus 1 --logger_name wandb ``` - + To combine datasets, as in Section 6.2, try something like: ```bash @@ -413,7 +417,7 @@ The command that produced `sshleifer/distilbart-xsum-12-6` is at [./train_distil ```bibtex @misc{shleifer2020pretrained, - title={Pre-trained Summarization Distillation}, + title={Pre-trained Summarization Distillation}, author={Sam Shleifer and Alexander M. Rush}, year={2020}, eprint={2010.13002}, diff --git a/examples/research_projects/seq2seq-distillation/requirements.txt b/examples/research_projects/seq2seq-distillation/requirements.txt index 0cd973d4d5ca7e..533f6339ab0898 100644 --- a/examples/research_projects/seq2seq-distillation/requirements.txt +++ b/examples/research_projects/seq2seq-distillation/requirements.txt @@ -4,7 +4,7 @@ psutil sacrebleu rouge-score tensorflow_datasets -pytorch-lightning==1.0.4 +pytorch-lightning matplotlib git-python==1.0.3 faiss-cpu diff --git a/notebooks/README.md b/notebooks/README.md index f2d2b9811630cf..22686cb2be757c 100644 --- a/notebooks/README.md +++ b/notebooks/README.md @@ -29,59 +29,60 @@ Pull Request so it can be included under the Community notebooks. 
You can open any page of the documentation as a notebook in colab (there is a button directly on said pages) but they are also listed here if you need to: -| Notebook | Description | | -|:----------|:-------------|------:| -| [Quicktour of the library](https://github.com/huggingface/notebooks/blob/master/transformers_doc/quicktour.ipynb) | A presentation of the various APIs in Transformers | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/transformers_doc/quicktour.ipynb) | -| [Summary of the tasks](https://github.com/huggingface/notebooks/blob/master/transformers_doc/task_summary.ipynb) | How to run the models of the Transformers library task by task | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/transformers_doc/task_summary.ipynb) | -| [Preprocessing data](https://github.com/huggingface/notebooks/blob/master/transformers_doc/preprocessing.ipynb) | How to use a tokenizer to preprocess your data | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/transformers_doc/preprocessing.ipynb) | -| [Fine-tuning a pretrained model](https://github.com/huggingface/notebooks/blob/master/transformers_doc/training.ipynb) | How to use the Trainer to fine-tune a pretrained model | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/transformers_doc/training.ipynb) | -| [Summary of the tokenizers](https://github.com/huggingface/notebooks/blob/master/transformers_doc/tokenizer_summary.ipynb) | The differences between the tokenizers algorithm | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/transformers_doc/tokenizer_summary.ipynb) | -| [Multilingual models](https://github.com/huggingface/notebooks/blob/master/transformers_doc/multilingual.ipynb) | How to use the multilingual models of the library | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/transformers_doc/multilingual.ipynb) | -| [Fine-tuning with custom datasets](https://github.com/huggingface/notebooks/blob/master/transformers_doc/custom_datasets.ipynb) | How to fine-tune a pretrained model on various tasks | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/transformers_doc/custom_datasets.ipynb) | +| Notebook | Description | | | +|:----------|:-------------|:-------------|------:| +| [Quicktour of the library](https://github.com/huggingface/notebooks/blob/master/transformers_doc/quicktour.ipynb) | A presentation of the various APIs in Transformers |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/transformers_doc/quicktour.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/transformers_doc/quicktour.ipynb)| +| [Summary of the tasks](https://github.com/huggingface/notebooks/blob/master/transformers_doc/task_summary.ipynb) | How to 
run the models of the Transformers library task by task |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/transformers_doc/task_summary.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/transformers_doc/task_summary.ipynb)| +| [Preprocessing data](https://github.com/huggingface/notebooks/blob/master/transformers_doc/preprocessing.ipynb) | How to use a tokenizer to preprocess your data |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/transformers_doc/preprocessing.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/transformers_doc/preprocessing.ipynb)| +| [Fine-tuning a pretrained model](https://github.com/huggingface/notebooks/blob/master/transformers_doc/training.ipynb) | How to use the Trainer to fine-tune a pretrained model |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/transformers_doc/training.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/transformers_doc/training.ipynb)| +| [Summary of the tokenizers](https://github.com/huggingface/notebooks/blob/master/transformers_doc/tokenizer_summary.ipynb) | The differences between the tokenizers algorithm |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/transformers_doc/tokenizer_summary.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/transformers_doc/tokenizer_summary.ipynb)| +| [Multilingual models](https://github.com/huggingface/notebooks/blob/master/transformers_doc/multilingual.ipynb) | How to use the multilingual models of the library |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/transformers_doc/multilingual.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/transformers_doc/multilingual.ipynb)| +| [Fine-tuning with custom datasets](https://github.com/huggingface/notebooks/blob/master/transformers_doc/custom_datasets.ipynb) | How to fine-tune a pretrained model on various tasks |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/transformers_doc/custom_datasets.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/transformers_doc/custom_datasets.ipynb)| ### PyTorch Examples -| Notebook | Description | | -|:----------|:-------------|------:| -| [Train your tokenizer](https://github.com/huggingface/notebooks/blob/master/examples/tokenizer_training.ipynb) | How to train and use your very own tokenizer |[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/tokenizer_training.ipynb) | -| [Train your language model](https://github.com/huggingface/notebooks/blob/master/examples/language_modeling_from_scratch.ipynb) | How to easily start using transformers | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/language_modeling_from_scratch.ipynb) | -| [How to fine-tune a model on text classification](https://github.com/huggingface/notebooks/blob/master/examples/text_classification.ipynb) | Show how to preprocess the data and fine-tune a pretrained model on any GLUE task. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/text_classification.ipynb)| -| [How to fine-tune a model on language modeling](https://github.com/huggingface/notebooks/blob/master/examples/language_modeling.ipynb) | Show how to preprocess the data and fine-tune a pretrained model on a causal or masked LM task. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/language_modeling.ipynb)| -| [How to fine-tune a model on token classification](https://github.com/huggingface/notebooks/blob/master/examples/token_classification.ipynb) | Show how to preprocess the data and fine-tune a pretrained model on a token classification task (NER, PoS). | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/token_classification.ipynb)| -| [How to fine-tune a model on question answering](https://github.com/huggingface/notebooks/blob/master/examples/question_answering.ipynb) | Show how to preprocess the data and fine-tune a pretrained model on SQUAD. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/question_answering.ipynb)| -| [How to fine-tune a model on multiple choice](https://github.com/huggingface/notebooks/blob/master/examples/multiple_choice.ipynb) | Show how to preprocess the data and fine-tune a pretrained model on SWAG. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/multiple_choice.ipynb)| -| [How to fine-tune a model on translation](https://github.com/huggingface/notebooks/blob/master/examples/translation.ipynb) | Show how to preprocess the data and fine-tune a pretrained model on WMT. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/translation.ipynb)| -| [How to fine-tune a model on summarization](https://github.com/huggingface/notebooks/blob/master/examples/summarization.ipynb) | Show how to preprocess the data and fine-tune a pretrained model on XSUM. 
| [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/summarization.ipynb)| -| [How to fine-tune a speech recognition model in English](https://github.com/huggingface/notebooks/blob/master/examples/speech_recognition.ipynb)| Show how to preprocess the data and fine-tune a pretrained Speech model on TIMIT | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/speech_recognition.ipynb)| -| [How to fine-tune a speech recognition model in any language](https://github.com/huggingface/notebooks/blob/master/examples/multi_lingual_speech_recognition.ipynb)| Show how to preprocess the data and fine-tune a multi-lingually pretrained speech model on Common Voice | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/multi_lingual_speech_recognition.ipynb)| -| [How to fine-tune a model on audio classification](https://github.com/huggingface/notebooks/blob/master/examples/audio_classification.ipynb)| Show how to preprocess the data and fine-tune a pretrained Speech model on Keyword Spotting | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/audio_classification.ipynb)| -| [How to train a language model from scratch](https://github.com/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb)| Highlight all the steps to effectively train Transformer model on custom data | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb)| -| [How to generate text](https://github.com/huggingface/blog/blob/master/notebooks/02_how_to_generate.ipynb)| How to use different decoding methods for language generation with transformers | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/02_how_to_generate.ipynb)| -| [How to export model to ONNX](https://github.com/huggingface/notebooks/blob/master/examples/onnx-export.ipynb) | Highlight how to export and run inference workloads through ONNX | -| [How to use Benchmarks](https://github.com/huggingface/notebooks/blob/master/examples/benchmark.ipynb) | How to benchmark models with transformers | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/benchmark.ipynb)| -| [Reformer](https://github.com/huggingface/blog/blob/master/notebooks/03_reformer.ipynb) | How Reformer pushes the limits of language modeling | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/blog/blob/master/notebooks/03_reformer.ipynb)| +| Notebook | Description | | | +|:----------|:-------------|:-------------|------:| +| [Train your tokenizer](https://github.com/huggingface/notebooks/blob/master/examples/tokenizer_training.ipynb) | How to train and use your very own tokenizer |[![Open in 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/tokenizer_training.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/examples/tokenizer_training.ipynb)| +| [Train your language model](https://github.com/huggingface/notebooks/blob/master/examples/language_modeling_from_scratch.ipynb) | How to easily start using transformers |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/language_modeling_from_scratch.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/examples/language_modeling_from_scratch.ipynb)| +| [How to fine-tune a model on text classification](https://github.com/huggingface/notebooks/blob/master/examples/text_classification.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on any GLUE task. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/text_classification.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/examples/text_classification.ipynb)| +| [How to fine-tune a model on language modeling](https://github.com/huggingface/notebooks/blob/master/examples/language_modeling.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on a causal or masked LM task. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/language_modeling.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/examples/language_modeling.ipynb)| +| [How to fine-tune a model on token classification](https://github.com/huggingface/notebooks/blob/master/examples/token_classification.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on a token classification task (NER, PoS). | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/token_classification.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/examples/token_classification.ipynb)| +| [How to fine-tune a model on question answering](https://github.com/huggingface/notebooks/blob/master/examples/question_answering.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on SQUAD. 
| [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/question_answering.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/examples/question_answering.ipynb)| +| [How to fine-tune a model on multiple choice](https://github.com/huggingface/notebooks/blob/master/examples/multiple_choice.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on SWAG. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/multiple_choice.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/examples/multiple_choice.ipynb)| +| [How to fine-tune a model on translation](https://github.com/huggingface/notebooks/blob/master/examples/translation.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on WMT. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/translation.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/examples/translation.ipynb)| +| [How to fine-tune a model on summarization](https://github.com/huggingface/notebooks/blob/master/examples/summarization.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on XSUM. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/summarization.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/examples/summarization.ipynb)| +| [How to fine-tune a speech recognition model in English](https://github.com/huggingface/notebooks/blob/master/examples/speech_recognition.ipynb)| Show how to preprocess the data and fine-tune a pretrained Speech model on TIMIT | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/speech_recognition.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/examples/speech_recognition.ipynb)| +| [How to fine-tune a speech recognition model in any language](https://github.com/huggingface/notebooks/blob/master/examples/multi_lingual_speech_recognition.ipynb)| Show how to preprocess the data and fine-tune a multi-lingually pretrained speech model on Common Voice | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/multi_lingual_speech_recognition.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/examples/multi_lingual_speech_recognition.ipynb)| +| [How to fine-tune a model on audio classification](https://github.com/huggingface/notebooks/blob/master/examples/audio_classification.ipynb)| Show how to preprocess the data 
and fine-tune a pretrained Speech model on Keyword Spotting | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/audio_classification.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/examples/audio_classification.ipynb)| +| [How to train a language model from scratch](https://github.com/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb)| Highlight all the steps to effectively train Transformer model on custom data | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb)| +| [How to generate text](https://github.com/huggingface/blog/blob/master/notebooks/02_how_to_generate.ipynb)| How to use different decoding methods for language generation with transformers | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/02_how_to_generate.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/blog/blob/master/notebooks/02_how_to_generate.ipynb)| +| [How to export model to ONNX](https://github.com/huggingface/notebooks/blob/master/examples/onnx-export.ipynb)| Highlight how to export and run inference workloads through ONNX | +| [How to use Benchmarks](https://github.com/huggingface/notebooks/blob/master/examples/benchmark.ipynb)| How to benchmark models with transformers | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/benchmark.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/examples/benchmark.ipynb)| +| [Reformer](https://github.com/huggingface/blog/blob/master/notebooks/03_reformer.ipynb)| How Reformer pushes the limits of language modeling | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/blog/blob/master/notebooks/03_reformer.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/patrickvonplaten/blog/blob/master/notebooks/03_reformer.ipynb)| +| [How to fine-tune a model on image classification](https://github.com/huggingface/notebooks/blob/master/examples/image_classification.ipynb) | Show how to preprocess the data and fine-tune any pretrained Vision model on Image Classification | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/image_classification.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/examples/image_classification.ipynb)| ### TensorFlow Examples -| Notebook | Description | | -|:----------|:-------------|------:| -| [Train your 
tokenizer](https://github.com/huggingface/notebooks/blob/master/examples/tokenizer_training.ipynb) | How to train and use your very own tokenizer |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/tokenizer_training.ipynb) | -| [Train your language model](https://github.com/huggingface/notebooks/blob/master/examples/language_modeling_from_scratch-tf.ipynb) | How to easily start using transformers | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/language_modeling_from_scratch-tf.ipynb) | -| [How to fine-tune a model on text classification](https://github.com/huggingface/notebooks/blob/master/examples/text_classification-tf.ipynb) | Show how to preprocess the data and fine-tune a pretrained model on any GLUE task. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/text_classification-tf.ipynb)| -| [How to fine-tune a model on language modeling](https://github.com/huggingface/notebooks/blob/master/examples/language_modeling-tf.ipynb) | Show how to preprocess the data and fine-tune a pretrained model on a causal or masked LM task. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/language_modeling-tf.ipynb)| -| [How to fine-tune a model on token classification](https://github.com/huggingface/notebooks/blob/master/examples/token_classification-tf.ipynb) | Show how to preprocess the data and fine-tune a pretrained model on a token classification task (NER, PoS). | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/token_classification-tf.ipynb)| -| [How to fine-tune a model on question answering](https://github.com/huggingface/notebooks/blob/master/examples/question_answering-tf.ipynb) | Show how to preprocess the data and fine-tune a pretrained model on SQUAD. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/question_answering-tf.ipynb)| -| [How to fine-tune a model on multiple choice](https://github.com/huggingface/notebooks/blob/master/examples/multiple_choice-tf.ipynb) | Show how to preprocess the data and fine-tune a pretrained model on SWAG. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/multiple_choice-tf.ipynb)| -| [How to fine-tune a model on translation](https://github.com/huggingface/notebooks/blob/master/examples/translation-tf.ipynb) | Show how to preprocess the data and fine-tune a pretrained model on WMT. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/translation-tf.ipynb)| -| [How to fine-tune a model on summarization](https://github.com/huggingface/notebooks/blob/master/examples/summarization-tf.ipynb) | Show how to preprocess the data and fine-tune a pretrained model on XSUM. 
| [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/summarization-tf.ipynb)| +| Notebook | Description | | | +|:----------|:-------------|:-------------|------:| +| [Train your tokenizer](https://github.com/huggingface/notebooks/blob/master/examples/tokenizer_training.ipynb) | How to train and use your very own tokenizer |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/tokenizer_training.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/examples/tokenizer_training.ipynb)| +| [Train your language model](https://github.com/huggingface/notebooks/blob/master/examples/language_modeling_from_scratch-tf.ipynb) | How to easily start using transformers |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/language_modeling_from_scratch-tf.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/examples/language_modeling_from_scratch-tf.ipynb)| +| [How to fine-tune a model on text classification](https://github.com/huggingface/notebooks/blob/master/examples/text_classification-tf.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on any GLUE task. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/text_classification-tf.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/examples/text_classification-tf.ipynb)| +| [How to fine-tune a model on language modeling](https://github.com/huggingface/notebooks/blob/master/examples/language_modeling-tf.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on a causal or masked LM task. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/language_modeling-tf.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/examples/language_modeling-tf.ipynb)| +| [How to fine-tune a model on token classification](https://github.com/huggingface/notebooks/blob/master/examples/token_classification-tf.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on a token classification task (NER, PoS). | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/token_classification-tf.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/examples/token_classification-tf.ipynb)| +| [How to fine-tune a model on question answering](https://github.com/huggingface/notebooks/blob/master/examples/question_answering-tf.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on SQUAD. 
| [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/question_answering-tf.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/examples/question_answering-tf.ipynb)| +| [How to fine-tune a model on multiple choice](https://github.com/huggingface/notebooks/blob/master/examples/multiple_choice-tf.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on SWAG. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/multiple_choice-tf.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/examples/multiple_choice-tf.ipynb)| +| [How to fine-tune a model on translation](https://github.com/huggingface/notebooks/blob/master/examples/translation-tf.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on WMT. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/translation-tf.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/examples/translation-tf.ipynb)| +| [How to fine-tune a model on summarization](https://github.com/huggingface/notebooks/blob/master/examples/summarization-tf.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on XSUM. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/summarization-tf.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/examples/summarization-tf.ipynb)| ### Optimum notebooks 🤗 [Optimum](https://github.com/huggingface/optimum) is an extension of 🤗 Transformers, providing a set of performance optimization tools enabling maximum efficiency to train and run models on targeted hardwares. -| Notebook | Description | | -|:----------|:-------------|------:| -| [How to quantize a model for text classification](https://github.com/huggingface/notebooks/blob/master/examples/text_classification_quantization_inc.ipynb) | Show how to apply [Intel Neural Compressor (INC)](https://github.com/intel/neural-compressor) quantization on a model for any GLUE task. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/text_classification_quantization_inc.ipynb)| +| Notebook | Description | | | +|:----------|:-------------|:-------------|------:| +| [How to quantize a model for text classification](https://github.com/huggingface/notebooks/blob/master/examples/text_classification_quantization_inc.ipynb)| Show how to apply [Intel Neural Compressor (INC)](https://github.com/intel/neural-compressor) quantization on a model for any GLUE task. 
| [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/text_classification_quantization_inc.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/examples/text_classification_quantization_inc.ipynb)| ## Community notebooks: diff --git a/scripts/distributed/torch-distributed-gpu-test.py b/scripts/distributed/torch-distributed-gpu-test.py new file mode 100755 index 00000000000000..22a99d570e4f85 --- /dev/null +++ b/scripts/distributed/torch-distributed-gpu-test.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python + +# +# This a `torch.distributed` diagnostics script that checks that all GPUs in the cluster (one or +# many nodes) can talk to each other via nccl and allocate gpu memory. +# +# To run first adjust the number of processes and nodes: +# +# python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py +# +# You may need to add --master_addr $MASTER_ADDR --master_port $MASTER_PORT if using a custom addr:port +# +# You can also use the rdzv API: --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT --rdzv_backend c10d +# +# use torch.distributed.launch instead of torch.distributed.run for torch < 1.9 +# +# If you get a hanging in `barrier` calls you have some network issues, you may try to debug this with: +# +# NCCL_DEBUG=INFO python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py +# +# which should tell you what's going on behind the scenes. +# +# +# This script can be run via `srun` in the SLURM environment as well. Here is a SLURM script that +# runs on 2 nodes of 4 gpus per node: +# +# #SBATCH --job-name=test-nodes # name +# #SBATCH --nodes=2 # nodes +# #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
+# #SBATCH --cpus-per-task=10 # number of cores per tasks +# #SBATCH --gres=gpu:4 # number of gpus +# #SBATCH --time 0:05:00 # maximum execution time (HH:MM:SS) +# #SBATCH --output=%x-%j.out # output file name +# +# GPUS_PER_NODE=4 +# MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +# MASTER_PORT=6000 +# +# srun --jobid $SLURM_JOBID bash -c 'python -m torch.distributed.run \ +# --nproc_per_node $GPUS_PER_NODE --nnodes $SLURM_NNODES --node_rank $SLURM_PROCID \ +# --master_addr $MASTER_ADDR --master_port $MASTER_PORT \ +# torch-distributed-gpu-test.py' +# + +import fcntl +import os +import socket + +import torch +import torch.distributed as dist + + +def printflock(*msgs): + """solves multi-process interleaved print problem""" + with open(__file__, "r") as fh: + fcntl.flock(fh, fcntl.LOCK_EX) + try: + print(*msgs) + finally: + fcntl.flock(fh, fcntl.LOCK_UN) + + +local_rank = int(os.environ["LOCAL_RANK"]) +torch.cuda.set_device(local_rank) +device = torch.device("cuda", local_rank) +hostname = socket.gethostname() + +gpu = f"[{hostname}-{local_rank}]" + +try: + # test distributed + dist.init_process_group("nccl") + dist.all_reduce(torch.ones(1).to(device), op=dist.ReduceOp.SUM) + dist.barrier() + + # test cuda is available and can allocate memory + torch.cuda.is_available() + torch.ones(1).cuda(local_rank) + + # global rank + rank = dist.get_rank() + world_size = dist.get_world_size() + + printflock(f"{gpu} is OK (global rank: {rank}/{world_size})") + + dist.barrier() + if rank == 0: + printflock(f"pt={torch.__version__}, cuda={torch.version.cuda}, nccl={torch.cuda.nccl.version()}") + +except Exception: + printflock(f"{gpu} is broken") + raise diff --git a/setup.py b/setup.py index 587ad724510de1..7395a959925f91 100644 --- a/setup.py +++ b/setup.py @@ -149,7 +149,7 @@ "tf2onnx", "timeout-decorator", "timm", - "tokenizers>=0.10.1,!=0.11.3", + "tokenizers>=0.11.1,!=0.11.3", "torch>=1.0", "torchaudio", "pyctcdecode>=0.3.0", diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 6dd06feb428485..a51be3697e81d8 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -246,7 +246,7 @@ "models.lxmert": ["LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "LxmertConfig", "LxmertTokenizer"], "models.m2m_100": ["M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP", "M2M100Config"], "models.marian": ["MarianConfig"], - "models.maskformer": ["MASK_FORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "MaskFormerConfig"], + "models.maskformer": ["MASKFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "MaskFormerConfig"], "models.mbart": ["MBartConfig"], "models.mbart50": [], "models.megatron_bert": ["MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "MegatronBertConfig"], @@ -264,6 +264,7 @@ "models.pegasus": ["PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP", "PegasusConfig", "PegasusTokenizer"], "models.perceiver": ["PERCEIVER_PRETRAINED_CONFIG_ARCHIVE_MAP", "PerceiverConfig", "PerceiverTokenizer"], "models.phobert": ["PhobertTokenizer"], + "models.poolformer": ["POOLFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "PoolFormerConfig"], "models.prophetnet": ["PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "ProphetNetConfig", "ProphetNetTokenizer"], "models.qdqbert": ["QDQBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "QDQBertConfig"], "models.rag": ["RagConfig", "RagRetriever", "RagTokenizer"], @@ -525,6 +526,7 @@ _import_structure["models.layoutxlm"].append("LayoutXLMProcessor") _import_structure["models.maskformer"].append("MaskFormerFeatureExtractor") 
_import_structure["models.perceiver"].append("PerceiverFeatureExtractor") + _import_structure["models.poolformer"].append("PoolFormerFeatureExtractor") _import_structure["models.segformer"].append("SegformerFeatureExtractor") _import_structure["models.vilt"].append("ViltFeatureExtractor") _import_structure["models.vilt"].append("ViltProcessor") @@ -646,8 +648,8 @@ _import_structure["generation_utils"] = ["top_k_top_p_filtering"] _import_structure["modeling_outputs"] = [] _import_structure["modeling_utils"] = ["Conv1D", "PreTrainedModel", "apply_chunking_to_forward", "prune_layer"] - # PyTorch models structure + # PyTorch models structure _import_structure["models.albert"].extend( [ "ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -1120,10 +1122,10 @@ _import_structure["models.marian"].extend(["MarianForCausalLM", "MarianModel", "MarianMTModel"]) _import_structure["models.maskformer"].extend( [ - "MASK_FORMER_PRETRAINED_MODEL_ARCHIVE_LIST", + "MASKFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", "MaskFormerForInstanceSegmentation", "MaskFormerModel", - "MaskFormerPreTrainedModel", + "MaskFormerPretrainedModel", ] ) _import_structure["models.mbart"].extend( @@ -1224,6 +1226,14 @@ "PerceiverPreTrainedModel", ] ) + _import_structure["models.poolformer"].extend( + [ + "POOLFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", + "PoolFormerForImageClassification", + "PoolFormerModel", + "PoolFormerPreTrainedModel", + ] + ) _import_structure["models.prophetnet"].extend( [ "PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -1602,6 +1612,14 @@ _import_structure["activations_tf"] = [] _import_structure["benchmark.benchmark_args_tf"] = ["TensorFlowBenchmarkArguments"] _import_structure["benchmark.benchmark_tf"] = ["TensorFlowBenchmark"] + _import_structure["generation_tf_logits_process"] = [ + "TFLogitsProcessor", + "TFLogitsProcessorList", + "TFMinLengthLogitsProcessor", + "TFNoBadWordsLogitsProcessor", + "TFNoRepeatNGramLogitsProcessor", + "TFRepetitionPenaltyLogitsProcessor", + ] _import_structure["generation_tf_utils"] = ["tf_top_k_top_p_filtering"] _import_structure["keras_callbacks"] = ["KerasMetricCallback", "PushToHubCallback"] _import_structure["modeling_tf_outputs"] = [] @@ -2056,6 +2074,7 @@ ] ) _import_structure["optimization_tf"] = ["AdamWeightDecay", "GradientAccumulator", "WarmUp", "create_optimizer"] + _import_structure["tf_utils"] = [] _import_structure["trainer_tf"] = ["TFTrainer"] else: @@ -2473,7 +2492,7 @@ from .models.lxmert import LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP, LxmertConfig, LxmertTokenizer from .models.m2m_100 import M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP, M2M100Config from .models.marian import MarianConfig - from .models.maskformer import MASK_FORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, MaskFormerConfig + from .models.maskformer import MASKFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, MaskFormerConfig from .models.mbart import MBartConfig from .models.megatron_bert import MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, MegatronBertConfig from .models.mmbt import MMBTConfig @@ -2485,6 +2504,7 @@ from .models.pegasus import PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP, PegasusConfig, PegasusTokenizer from .models.perceiver import PERCEIVER_PRETRAINED_CONFIG_ARCHIVE_MAP, PerceiverConfig, PerceiverTokenizer from .models.phobert import PhobertTokenizer + from .models.poolformer import POOLFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, PoolFormerConfig from .models.prophetnet import PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP, ProphetNetConfig, ProphetNetTokenizer from .models.qdqbert import QDQBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, QDQBertConfig 
from .models.rag import RagConfig, RagRetriever, RagTokenizer @@ -2703,6 +2723,7 @@ from .models.layoutxlm import LayoutXLMProcessor from .models.maskformer import MaskFormerFeatureExtractor from .models.perceiver import PerceiverFeatureExtractor + from .models.poolformer import PoolFormerFeatureExtractor from .models.segformer import SegformerFeatureExtractor from .models.vilt import ViltFeatureExtractor, ViltProcessor from .models.vit import ViTFeatureExtractor @@ -3189,10 +3210,10 @@ ) from .models.marian import MarianForCausalLM, MarianModel, MarianMTModel from .models.maskformer import ( - MASK_FORMER_PRETRAINED_MODEL_ARCHIVE_LIST, + MASKFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, MaskFormerForInstanceSegmentation, MaskFormerModel, - MaskFormerPreTrainedModel, + MaskFormerPretrainedModel, ) from .models.mbart import ( MBartForCausalLM, @@ -3281,6 +3302,12 @@ PerceiverModel, PerceiverPreTrainedModel, ) + from .models.poolformer import ( + POOLFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, + PoolFormerForImageClassification, + PoolFormerModel, + PoolFormerPreTrainedModel, + ) from .models.prophetnet import ( PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST, ProphetNetDecoder, @@ -3590,6 +3617,14 @@ # Benchmarks from .benchmark.benchmark_tf import TensorFlowBenchmark + from .generation_tf_logits_process import ( + TFLogitsProcessor, + TFLogitsProcessorList, + TFMinLengthLogitsProcessor, + TFNoBadWordsLogitsProcessor, + TFNoRepeatNGramLogitsProcessor, + TFRepetitionPenaltyLogitsProcessor, + ) from .generation_tf_utils import tf_top_k_top_p_filtering from .keras_callbacks import KerasMetricCallback, PushToHubCallback from .modeling_tf_layoutlm import ( diff --git a/src/transformers/activations.py b/src/transformers/activations.py index 3d81e8bb1dd6f9..e845e7712e7c45 100644 --- a/src/transformers/activations.py +++ b/src/transformers/activations.py @@ -16,7 +16,7 @@ import torch from packaging import version -from torch import nn +from torch import Tensor, nn from .utils import logging @@ -24,39 +24,66 @@ logger = logging.get_logger(__name__) -def gelu_python(x): +class NewGELUActivation(nn.Module): + """ + Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see + the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 + """ + + def __init__(self): + super().__init__() + + def forward(self, input: Tensor) -> Tensor: + return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0)))) + + +class GELUActivation(nn.Module): """ Original Implementation of the GELU activation function in Google BERT repo when initially created. 
For information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 """ - return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) + def __init__(self, use_gelu_python: bool = False): + super().__init__() + if version.parse(torch.__version__) < version.parse("1.4") or use_gelu_python: + self.act = self._gelu_python + else: + self.act = nn.functional.gelu + + def _gelu_python(self, input: Tensor) -> Tensor: + return input * 0.5 * (1.0 + torch.erf(input / math.sqrt(2.0))) -def gelu_new(x): + def forward(self, input: Tensor) -> Tensor: + return self.act(input) + + +class FastGELUActivation(nn.Module): """ - Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see - the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 + Applies GELU approximation that is slower than QuickGELU but more accurate. See: https://github.com/hendrycks/GELUs """ - return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0)))) + def __init__(self): + super().__init__() -if version.parse(torch.__version__) < version.parse("1.4"): - gelu = gelu_python -else: - gelu = nn.functional.gelu + def forward(self, input: Tensor) -> Tensor: + return 0.5 * input * (1.0 + torch.tanh(input * 0.7978845608 * (1.0 + 0.044715 * input * input))) -def gelu_fast(x): - return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x))) +class QuickGELUActivation(nn.Module): + """ + Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs + """ + def __init__(self): + super().__init__() -def quick_gelu(x): - return x * torch.sigmoid(1.702 * x) + def forward(self, input: Tensor) -> Tensor: + return input * torch.sigmoid(1.702 * input) -def _silu_python(x): +class SiLUActivation(nn.Module): """ See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function @@ -64,46 +91,65 @@ def _silu_python(x): Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with later. """ - return x * torch.sigmoid(x) + def __init__(self): + if version.parse(torch.__version__) < version.parse("1.7"): + self.act = self._silu_python + else: + self.act = nn.functional.silu -if version.parse(torch.__version__) < version.parse("1.7"): - silu = _silu_python -else: - silu = nn.functional.silu + def _silu_python(self, input: Tensor) -> Tensor: + return input * torch.sigmoid(input) + def forward(self, input: Tensor) -> Tensor: + return self.act(input) -def _mish_python(x): + +class MishActivation(nn.Module): """ See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://arxiv.org/abs/1908.08681). 
Also visit the official repository for the paper: https://github.com/digantamisra98/Mish """ - return x * torch.tanh(nn.functional.softplus(x)) + def __init__(self): + super().__init__() + if version.parse(torch.__version__) < version.parse("1.9"): + self.act = self._mish_python + else: + self.act = nn.functional.mish -if version.parse(torch.__version__) < version.parse("1.9"): - mish = _mish_python -else: - mish = nn.functional.mish + def _mish_python(self, input: Tensor) -> Tensor: + return input * torch.tanh(nn.functional.softplus(input)) + def forward(self, input: Tensor) -> Tensor: + return self.act(input) -def linear_act(x): - return x + +class LinearActivation(nn.Module): + """ + Applies the linear activation function, i.e. forwarding input directly to output. + """ + + def __init__(self): + super().__init__() + + def forward(self, input: Tensor) -> Tensor: + return input ACT2FN = { - "relu": nn.functional.relu, - "silu": silu, - "swish": silu, - "gelu": gelu, - "tanh": torch.tanh, - "gelu_python": gelu_python, - "gelu_new": gelu_new, - "gelu_fast": gelu_fast, - "quick_gelu": quick_gelu, - "mish": mish, - "linear": linear_act, - "sigmoid": torch.sigmoid, + "relu": nn.ReLU(), + "silu": SiLUActivation(), + "swish": SiLUActivation(), + "gelu": GELUActivation(), + "tanh": nn.Tanh(), + "gelu_python": GELUActivation(use_gelu_python=True), + "gelu_new": NewGELUActivation(), + "gelu_fast": FastGELUActivation(), + "quick_gelu": QuickGELUActivation(), + "mish": MishActivation(), + "linear": LinearActivation(), + "sigmoid": nn.Sigmoid(), } @@ -112,3 +158,14 @@ def get_activation(activation_string): return ACT2FN[activation_string] else: raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}") + + +# For backwards compatibility with: from activations import gelu_python +gelu_python = get_activation("gelu_python") +gelu_new = get_activation("gelu_new") +gelu = get_activation("gelu") +gelu_fast = get_activation("gelu_fast") +quick_gelu = get_activation("quick_gelu") +silu = get_activation("silu") +mish = get_activation("mish") +linear_act = get_activation("linear") diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index 580de6b91ee553..ba20b00d3e5364 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -580,7 +580,7 @@ def _get_config_dict( if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): config_file = pretrained_model_name_or_path else: - configuration_file = kwargs.get("_configuration_file", CONFIG_NAME) + configuration_file = kwargs.pop("_configuration_file", CONFIG_NAME) if os.path.isdir(pretrained_model_name_or_path): config_file = os.path.join(pretrained_model_name_or_path, configuration_file) @@ -602,36 +602,31 @@ def _get_config_dict( user_agent=user_agent, ) - except RepositoryNotFoundError as err: - logger.error(err) + except RepositoryNotFoundError: raise EnvironmentError( f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier listed on " "'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a token having " "permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass " "`use_auth_token=True`." 
) - except RevisionNotFoundError as err: - logger.error(err) + except RevisionNotFoundError: raise EnvironmentError( f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for this " f"model name. Check the model page at 'https://huggingface.co/{pretrained_model_name_or_path}' for " "available revisions." ) - except EntryNotFoundError as err: - logger.error(err) + except EntryNotFoundError: raise EnvironmentError( f"{pretrained_model_name_or_path} does not appear to have a file named {configuration_file}." ) - except HTTPError as err: - logger.error(err) + except HTTPError: raise EnvironmentError( "We couldn't connect to 'https://huggingface.co/' to load this model and it looks like " f"{pretrained_model_name_or_path} is not the path to a directory conaining a {configuration_file} " "file.\nCheckout your internet connection or see how to run the library in offline mode at " "'https://huggingface.co/docs/transformers/installation#offline-mode'." ) - except EnvironmentError as err: - logger.error(err) + except EnvironmentError: raise EnvironmentError( f"Can't load config for '{pretrained_model_name_or_path}'. If you were trying to load it from " "'https://huggingface.co/models', make sure you don't have a local directory with the same name. " diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index d62d8d6701efa3..9c60024f092dec 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -59,7 +59,7 @@ "tf2onnx": "tf2onnx", "timeout-decorator": "timeout-decorator", "timm": "timm", - "tokenizers": "tokenizers>=0.10.1,!=0.11.3", + "tokenizers": "tokenizers>=0.11.1,!=0.11.3", "torch": "torch>=1.0", "torchaudio": "torchaudio", "pyctcdecode": "pyctcdecode>=0.3.0", diff --git a/src/transformers/dynamic_module_utils.py b/src/transformers/dynamic_module_utils.py index 7ce71ac75f914e..91f5bb36a9645c 100644 --- a/src/transformers/dynamic_module_utils.py +++ b/src/transformers/dynamic_module_utils.py @@ -395,8 +395,8 @@ def custom_object_save(obj, folder, config=None): "this code in a separate module so we can include it in the saved folder and make it easier to share via " "the Hub." ) - # Add object class to the config auto_map - if config is not None: + + def _set_auto_map_in_config(_config): module_name = obj.__class__.__module__ last_module = module_name.split(".")[-1] full_name = f"{last_module}.{obj.__class__.__name__}" @@ -418,12 +418,21 @@ def custom_object_save(obj, folder, config=None): full_name = (slow_tokenizer_class, fast_tokenizer_class) - if isinstance(config, dict): - config["auto_map"] = full_name - elif getattr(config, "auto_map", None) is not None: - config.auto_map[obj._auto_class] = full_name + if isinstance(_config, dict): + auto_map = _config.get("auto_map", {}) + auto_map[obj._auto_class] = full_name + _config["auto_map"] = auto_map + elif getattr(_config, "auto_map", None) is not None: + _config.auto_map[obj._auto_class] = full_name else: - config.auto_map = {obj._auto_class: full_name} + _config.auto_map = {obj._auto_class: full_name} + + # Add object class to the config auto_map + if isinstance(config, (list, tuple)): + for cfg in config: + _set_auto_map_in_config(cfg) + elif config is not None: + _set_auto_map_in_config(config) # Copy module file to the output folder. 
object_file = sys.modules[obj.__module__].__file__ diff --git a/src/transformers/feature_extraction_utils.py b/src/transformers/feature_extraction_utils.py index b65b7cfcd9168c..f4877cd3a6a371 100644 --- a/src/transformers/feature_extraction_utils.py +++ b/src/transformers/feature_extraction_utils.py @@ -26,9 +26,11 @@ from requests import HTTPError +from .dynamic_module_utils import custom_object_save from .file_utils import ( FEATURE_EXTRACTOR_NAME, EntryNotFoundError, + PushToHubMixin, RepositoryNotFoundError, RevisionNotFoundError, TensorType, @@ -36,6 +38,7 @@ _is_numpy, _is_torch_device, cached_path, + copy_func, hf_bucket_url, is_flax_available, is_offline_mode, @@ -199,12 +202,14 @@ def to(self, device: Union[str, "torch.device"]) -> "BatchFeature": return self -class FeatureExtractionMixin: +class FeatureExtractionMixin(PushToHubMixin): """ This is a feature extraction mixin used to provide saving/loading functionality for sequential and image feature extractors. """ + _auto_class = None + def __init__(self, **kwargs): """Set elements of `kwargs` as attributes.""" # Pop "processor_class" as it should be saved as private attribute @@ -305,7 +310,7 @@ def from_pretrained( return cls.from_dict(feature_extractor_dict, **kwargs) - def save_pretrained(self, save_directory: Union[str, os.PathLike]): + def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs): """ Save a feature_extractor object to the directory `save_directory`, so that it can be re-loaded using the [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] class method. @@ -313,15 +318,42 @@ def save_pretrained(self, save_directory: Union[str, os.PathLike]): Args: save_directory (`str` or `os.PathLike`): Directory where the feature extractor JSON file will be saved (will be created if it does not exist). + push_to_hub (`bool`, *optional*, defaults to `False`): + Whether or not to push your feature extractor to the Hugging Face model hub after saving it. + + + + Using `push_to_hub=True` will synchronize the repository you are pushing to with `save_directory`, + which requires `save_directory` to be a local clone of the repo you are pushing to if it's an existing + folder. Pass along `temp_dir=True` to use a temporary directory instead. + + + + kwargs: + Additional key word arguments passed along to the [`~file_utils.PushToHubMixin.push_to_hub`] method. """ if os.path.isfile(save_directory): raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") + + if push_to_hub: + commit_message = kwargs.pop("commit_message", None) + repo = self._create_or_get_repo(save_directory, **kwargs) + + # If we have a custom config, we copy the file defining it in the folder and set the attributes so it can be + # loaded from the Hub. 
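+ # Note: the feature extractor itself is passed as `config` below, so `custom_object_save` records the `auto_map` entry directly on this instance and it is serialized along with the other attributes.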
+ if self._auto_class is not None: + custom_object_save(self, save_directory, config=self) + os.makedirs(save_directory, exist_ok=True) # If we save using the predefined names, we can load using `from_pretrained` output_feature_extractor_file = os.path.join(save_directory, FEATURE_EXTRACTOR_NAME) self.to_json_file(output_feature_extractor_file) - logger.info(f"Configuration saved in {output_feature_extractor_file}") + logger.info(f"Feature extractor saved in {output_feature_extractor_file}") + + if push_to_hub: + url = self._push_to_hub(repo, commit_message=commit_message) + logger.info(f"Feature extractor pushed to the hub in this commit: {url}") @classmethod def get_feature_extractor_dict( @@ -380,36 +412,31 @@ def get_feature_extractor_dict( user_agent=user_agent, ) - except RepositoryNotFoundError as err: - logger.error(err) + except RepositoryNotFoundError: raise EnvironmentError( f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier listed on " "'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a token having " "permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass " "`use_auth_token=True`." ) - except RevisionNotFoundError as err: - logger.error(err) + except RevisionNotFoundError: raise EnvironmentError( f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for this " f"model name. Check the model page at 'https://huggingface.co/{pretrained_model_name_or_path}' for " "available revisions." ) - except EntryNotFoundError as err: - logger.error(err) + except EntryNotFoundError: raise EnvironmentError( f"{pretrained_model_name_or_path} does not appear to have a file named {FEATURE_EXTRACTOR_NAME}." ) - except HTTPError as err: - logger.error(err) + except HTTPError: raise EnvironmentError( "We couldn't connect to 'https://huggingface.co/' to load this model and it looks like " f"{pretrained_model_name_or_path} is not the path to a directory conaining a " f"{FEATURE_EXTRACTOR_NAME} file.\nCheckout your internet connection or see how to run the library in " "offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'." ) - except EnvironmentError as err: - logger.error(err) + except EnvironmentError: raise EnvironmentError( f"Can't load feature extractor for '{pretrained_model_name_or_path}'. If you were trying to load it " "from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. " @@ -539,3 +566,35 @@ def to_json_file(self, json_file_path: Union[str, os.PathLike]): def __repr__(self): return f"{self.__class__.__name__} {self.to_json_string()}" + + @classmethod + def register_for_auto_class(cls, auto_class="AutoFeatureExtractor"): + """ + Register this class with a given auto class. This should only be used for custom feature extractors as the ones + in the library are already mapped with `AutoFeatureExtractor`. + + + + This API is experimental and may have some slight breaking changes in the next releases. + + + + Args: + auto_class (`str` or `type`, *optional*, defaults to `"AutoFeatureExtractor"`): + The auto class to register this new feature extractor with. 
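+
+        Example (a minimal sketch; `CustomFeatureExtractor` is a hypothetical user-defined class, not part of the
+        library):
+
+        ```python
+        >>> from transformers import SequenceFeatureExtractor
+
+        >>> class CustomFeatureExtractor(SequenceFeatureExtractor):
+        ...     pass  # custom preprocessing logic would live here
+
+        >>> # lets `AutoFeatureExtractor` locate and reload this class from a folder saved with `save_pretrained`
+        >>> CustomFeatureExtractor.register_for_auto_class()
+        ```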
+ """ + if not isinstance(auto_class, str): + auto_class = auto_class.__name__ + + import transformers.models.auto as auto_module + + if not hasattr(auto_module, auto_class): + raise ValueError(f"{auto_class} is not a valid auto class.") + + cls._auto_class = auto_class + + +FeatureExtractionMixin.push_to_hub = copy_func(FeatureExtractionMixin.push_to_hub) +FeatureExtractionMixin.push_to_hub.__doc__ = FeatureExtractionMixin.push_to_hub.__doc__.format( + object="feature extractor", object_class="AutoFeatureExtractor", object_files="feature extractor file" +) diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py index abc26cada9bd2f..21194cb6f90e56 100644 --- a/src/transformers/file_utils.py +++ b/src/transformers/file_utils.py @@ -827,8 +827,10 @@ def requires_backends(obj, backends): backends = [backends] name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__ - if not all(BACKENDS_MAPPING[backend][0]() for backend in backends): - raise ImportError("".join([BACKENDS_MAPPING[backend][1].format(name) for backend in backends])) + checks = (BACKENDS_MAPPING[backend] for backend in backends) + failed = [msg.format(name) for available, msg in checks if not available()] + if failed: + raise ImportError("".join(failed)) class DummyObject(type): @@ -2300,16 +2302,14 @@ def get_file_from_repo( use_auth_token=use_auth_token, ) - except RepositoryNotFoundError as err: - logger.error(err) + except RepositoryNotFoundError: raise EnvironmentError( f"{path_or_repo} is not a local folder and is not a valid model identifier " "listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to " "pass a token having permission to this repo with `use_auth_token` or log in with " "`huggingface-cli login` and pass `use_auth_token=True`." ) - except RevisionNotFoundError as err: - logger.error(err) + except RevisionNotFoundError: raise EnvironmentError( f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists " "for this model name. Check the model page at " diff --git a/src/transformers/generation_flax_logits_process.py b/src/transformers/generation_flax_logits_process.py index 1d669534134381..76a09ed012dd29 100644 --- a/src/transformers/generation_flax_logits_process.py +++ b/src/transformers/generation_flax_logits_process.py @@ -14,7 +14,6 @@ # limitations under the License. 
import inspect -from abc import ABC import jax import jax.lax as lax @@ -48,7 +47,7 @@ """ -class FlaxLogitsProcessor(ABC): +class FlaxLogitsProcessor: """Abstract base class for all logit processors that can be applied during generation.""" @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING) @@ -59,7 +58,7 @@ def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray) -> jnp.ndarray: ) -class FlaxLogitsWarper(ABC): +class FlaxLogitsWarper: """Abstract base class for all logit warpers that can be applied during generation with multinomial sampling.""" @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING) diff --git a/src/transformers/generation_logits_process.py b/src/transformers/generation_logits_process.py index ad79273502e9e7..18f8c5971f5a04 100644 --- a/src/transformers/generation_logits_process.py +++ b/src/transformers/generation_logits_process.py @@ -15,7 +15,6 @@ import inspect import math -from abc import ABC from typing import Callable, Iterable, List, Optional import numpy as np @@ -49,7 +48,7 @@ """ -class LogitsProcessor(ABC): +class LogitsProcessor: """Abstract base class for all logit processors that can be applied during generation.""" @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING) @@ -60,7 +59,7 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to ) -class LogitsWarper(ABC): +class LogitsWarper: """Abstract base class for all logit warpers that can be applied during generation with multinomial sampling.""" @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING) @@ -380,8 +379,9 @@ class NoBadWordsLogitsProcessor(LogitsProcessor): Args: bad_words_ids (`List[List[int]]`): - List of list of token ids that are not allowed to be generated. In order to get the tokens of the words - that should not appear in the generated text, use `tokenizer(bad_word, add_prefix_space=True).input_ids`. + List of list of token ids that are not allowed to be generated. In order to get the token ids of the words + that should not appear in the generated text, use `tokenizer(bad_words, add_prefix_space=True, + add_special_tokens=False).input_ids`. eos_token_id (`int`): The id of the *end-of-sequence* token. """ diff --git a/src/transformers/generation_tf_logits_process.py b/src/transformers/generation_tf_logits_process.py new file mode 100644 index 00000000000000..74a61768566743 --- /dev/null +++ b/src/transformers/generation_tf_logits_process.py @@ -0,0 +1,295 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import List + +import numpy as np +import tensorflow as tf + +from .file_utils import add_start_docstrings +from .tf_utils import set_tensor_by_indices_to_value +from .utils.logging import get_logger + + +logger = get_logger(__name__) + + +TF_LOGITS_PROCESSOR_INPUTS_DOCSTRING = r""" + Args: + input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. 
+ + Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + scores (`tf.Tensor` of shape `(batch_size, config.vocab_size)`): + Prediction scores of a language modeling head. These can be logits for each vocabulary when not using beam + search or log softmax for each vocabulary token when using beam search + kwargs: + Additional logits processor specific kwargs. + + Return: + `tf.Tensor` of shape `(batch_size, config.vocab_size)`: The processed prediction scores. +""" + + +class TFLogitsProcessor: + """Abstract base class for all logit processors that can be applied during generation.""" + + @add_start_docstrings(TF_LOGITS_PROCESSOR_INPUTS_DOCSTRING) + def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor) -> tf.Tensor: + """TF method for processing logits.""" + raise NotImplementedError( + f"{self.__class__} is an abstract class. Only classes inheriting this class can be called." + ) + + +class TFLogitsProcessorList(list): + """ + This class can be used to create a list of [`TFLogitsProcessor`] to subsequently process a `scores` input tensor. + This class inherits from list and adds a specific *__call__* method to apply each [`TFLogitsProcessor`] to the + inputs. + """ + + @add_start_docstrings(TF_LOGITS_PROCESSOR_INPUTS_DOCSTRING) + def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, **kwargs) -> tf.Tensor: + for processor in self: + function_args = inspect.signature(processor.__call__).parameters + if len(function_args) > 2: + if not all(arg in kwargs for arg in list(function_args.keys())[2:]): + raise ValueError( + f"Make sure that all the required parameters: {list(function_args.keys())} for " + f"{processor.__class__} are passed to the logits processor." + ) + scores = processor(input_ids, scores, **kwargs) + else: + scores = processor(input_ids, scores) + return scores + + +class TFMinLengthLogitsProcessor(TFLogitsProcessor): + r""" + [`TFLogitsProcessor`] enforcing a min-length by setting EOS probability to 0. + + Args: + min_length (`int`): + The minimum length below which the score of `eos_token_id` is set to `-float("Inf")`. + eos_token_id (`int`): + The id of the *end-of-sequence* token. + """ + + def __init__(self, min_length: int, eos_token_id: int): + if not isinstance(min_length, int) or min_length < 0: + raise ValueError(f"`min_length` has to be a positive integer, but is {min_length}") + + if not isinstance(eos_token_id, int) or eos_token_id < 0: + raise ValueError(f"`eos_token_id` has to be a positive integer, but is {eos_token_id}") + + self.min_length = min_length + self.eos_token_id = eos_token_id + + def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor) -> tf.Tensor: + # create boolean flag to decide if min length penalty should be applied + cur_len = input_ids.shape[-1] + apply_penalty = 1 - tf.clip_by_value(cur_len - self.min_length, 0, 1) + + # TODO(Matt) - this if statement has to be rewritten for XLA. Leaving it now though since + # generate is not XLA - compileable anyways + if apply_penalty: + eos_token_id_mask = tf.broadcast_to(tf.range(scores.shape[-1]) == self.eos_token_id, scores.shape) + scores = set_tensor_by_indices_to_value(scores, eos_token_id_mask, float("-inf")) + + return scores + + +class TFRepetitionPenaltyLogitsProcessor(TFLogitsProcessor): + r""" + [`TFLogitsProcessor`] enforcing an exponential penalty on repeated sequences. 
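+ + For example (illustrative numbers): with `penalty=1.2`, the score of every token that already occurs in `input_ids` is + divided by 1.2 if it is positive and multiplied by 1.2 if it is negative, so previously generated tokens become less + likely to be sampled again.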
+ + Args: + repetition_penalty (`float`): + The parameter for repetition penalty. 1.0 means no penalty. See [this + paper](https://arxiv.org/pdf/1909.05858.pdf) for more details. + """ + + def __init__(self, penalty: float): + if not isinstance(penalty, float) or not (penalty > 0): + raise ValueError(f"`penalty` has to be a strictly positive float, but is {penalty}") + + self.penalty = penalty + + def _create_score_penalties(self, input_ids, logits): + # create logit penalties for already seen input_ids + token_penalties = np.ones(logits.shape) + prev_input_ids = [np.unique(input_id) for input_id in input_ids.numpy()] + for i, prev_input_id in enumerate(prev_input_ids): + logit_penalized = logits[i].numpy()[prev_input_id] + logit_penalties = np.zeros(logit_penalized.shape) + # if previous logit score is < 0 then multiply repetition penalty else divide + logit_penalties[logit_penalized < 0] = self.penalty + logit_penalties[logit_penalized > 0] = 1 / self.penalty + np.put(token_penalties[i], prev_input_id, logit_penalties) + return tf.convert_to_tensor(token_penalties, dtype=tf.float32) + + def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor) -> tf.Tensor: + + score_penalties = self._create_score_penalties(input_ids, scores) + + scores = tf.math.multiply(scores, score_penalties) + + return scores + + +class TFNoBadWordsLogitsProcessor(TFLogitsProcessor): + """ + [`TFLogitsProcessor`] that enforces that specified sequences will never be sampled. + + Args: + bad_words_ids (`List[List[int]]`): + List of list of token ids that are not allowed to be generated. In order to get the token ids of the words + that should not appear in the generated text, use `tokenizer(bad_words, add_prefix_space=True, + add_special_tokens=False).input_ids`. + eos_token_id (`int`): + The id of the *end-of-sequence* token. + """ + + def __init__(self, bad_words_ids: List[List[int]], eos_token_id: int): + + if not isinstance(bad_words_ids, List) or len(bad_words_ids) == 0: + raise ValueError(f"`bad_words_ids` has to be a non-empty list, but is {bad_words_ids}.") + if any(not isinstance(bad_word_ids, list) for bad_word_ids in bad_words_ids): + raise ValueError(f"`bad_words_ids` has to be a list of lists, but is {bad_words_ids}.") + if any( + any((not isinstance(token_id, (int, np.integer)) or token_id < 0) for token_id in bad_word_ids) + for bad_word_ids in bad_words_ids + ): + raise ValueError( + f"Each list in `bad_words_ids` has to be a list of positive integers, but is {bad_words_ids}."
+ ) + + self.bad_words_ids = bad_words_ids + + def calc_banned_bad_words_ids(self, prev_input_ids): + banned_tokens = [] + + def _tokens_match(prev_tokens, tokens): + if len(tokens) == 0: + # if bad word tokens is just one token always ban it + return True + if len(tokens) > len(prev_tokens): + # if bad word tokens are longer than prev tokens they can't be equal + return False + + if prev_tokens[-len(tokens) :] == tokens: + # if tokens match + return True + else: + return False + + for prev_input_ids_slice in prev_input_ids: + banned_tokens_slice = [] + + for banned_token_seq in self.bad_words_ids: + assert ( + len(banned_token_seq) > 0 + ), f"Banned words token sequences {self.bad_words_ids} cannot have an empty list" + + if _tokens_match(prev_input_ids_slice.numpy().tolist(), banned_token_seq[:-1]) is False: + # if tokens do not match continue + continue + + banned_tokens_slice.append(banned_token_seq[-1]) + + banned_tokens.append(banned_tokens_slice) + + return banned_tokens + + def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor) -> tf.Tensor: + + vocab_size = scores.shape[-1] + + # calculate a list of banned tokens according to bad words + banned_tokens = self.calc_banned_bad_words_ids(input_ids) + + banned_tokens_indices_mask = [] + for banned_tokens_slice in banned_tokens: + banned_tokens_indices_mask.append( + [True if token in banned_tokens_slice else False for token in range(vocab_size)] + ) + + scores = set_tensor_by_indices_to_value( + scores, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") + ) + + return scores + + +class TFNoRepeatNGramLogitsProcessor(TFLogitsProcessor): + r""" + [`TFLogitsProcessor`] that enforces no repetition of n-grams. See + [Fairseq](https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345). + + Args: + ngram_size (`int`): + All ngrams of size `ngram_size` can only occur once. 
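+ + For example (illustrative): with `ngram_size=2`, once the pair of tokens ("New", "York") has been generated, the + token "York" is banned (its score is set to `-inf`) whenever the most recently generated token is "New", so that + bigram cannot be produced a second time.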
+ """ + + def __init__(self, ngram_size: int): + if not isinstance(ngram_size, int) or ngram_size <= 0: + raise ValueError(f"`ngram_size` has to be a strictly positive integer, but is {ngram_size}") + self.ngram_size = ngram_size + + def calc_banned_ngram_tokens(self, prev_input_ids, num_hypos, cur_len): + # Copied from fairseq for no_repeat_ngram in beam_search + if cur_len + 1 < self.ngram_size: + # return no banned tokens if we haven't generated ngram_size tokens yet + return [[] for _ in range(num_hypos)] + generated_ngrams = [{} for _ in range(num_hypos)] + for idx in range(num_hypos): + gen_tokens = prev_input_ids[idx].numpy().tolist() + generated_ngram = generated_ngrams[idx] + for ngram in zip(*[gen_tokens[i:] for i in range(self.ngram_size)]): + prev_ngram_tuple = tuple(ngram[:-1]) + generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]] + + def _get_generated_ngrams(hypo_idx): + # Before decoding the next token, prevent decoding of ngrams that have already appeared + start_idx = cur_len + 1 - self.ngram_size + ngram_idx = tuple(prev_input_ids[hypo_idx, start_idx:cur_len].numpy().tolist()) + return generated_ngrams[hypo_idx].get(ngram_idx, []) + + banned_tokens = [_get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)] + + return banned_tokens + + def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor) -> tf.Tensor: + + batch_size, vocab_size = scores.shape + cur_len = input_ids.shape[-1] + banned_tokens = self.calc_banned_ngram_tokens(input_ids, batch_size, cur_len) + + # create banned_tokens boolean mask + banned_tokens_indices_mask = [] + for banned_tokens_slice in banned_tokens: + banned_tokens_indices_mask.append( + [True if token in banned_tokens_slice else False for token in range(vocab_size)] + ) + + scores = set_tensor_by_indices_to_value( + scores, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") + ) + + return scores diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py index d10b5817a408a4..b8d4746fe2e666 100644 --- a/src/transformers/generation_tf_utils.py +++ b/src/transformers/generation_tf_utils.py @@ -16,12 +16,20 @@ import inspect from dataclasses import dataclass -from typing import Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import numpy as np import tensorflow as tf from .file_utils import ModelOutput +from .generation_tf_logits_process import ( + TFLogitsProcessorList, + TFMinLengthLogitsProcessor, + TFNoBadWordsLogitsProcessor, + TFNoRepeatNGramLogitsProcessor, + TFRepetitionPenaltyLogitsProcessor, +) +from .tf_utils import set_tensor_by_indices_to_value, shape_list from .utils import logging @@ -476,18 +484,18 @@ def generate( If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible [`~file_utils.ModelOutput`] types are: - - [`~generation_utils.TFGreedySearchDecoderOnlyOutput`], - - [`~generation_utils.TFSampleDecoderOnlyOutput`], - - [`~generation_utils.TFBeamSearchDecoderOnlyOutput`], - - [`~generation_utils.TFBeamSampleDecoderOnlyOutput`] + - [`~generation_tf_utils.TFGreedySearchDecoderOnlyOutput`], + - [`~generation_tf_utils.TFSampleDecoderOnlyOutput`], + - [`~generation_tf_utils.TFBeamSearchDecoderOnlyOutput`], + - [`~generation_tf_utils.TFBeamSampleDecoderOnlyOutput`] If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible [`~file_utils.ModelOutput`] types are: - - 
[`~generation_utils.TFGreedySearchEncoderDecoderOutput`], - - [`~generation_utils.TFSampleEncoderDecoderOutput`], - - [`~generation_utils.TFBeamSearchEncoderDecoderOutput`], - - [`~generation_utils.TFBeamSampleEncoderDecoderOutput`] + - [`~generation_tf_utils.TFGreedySearchEncoderDecoderOutput`], + - [`~generation_tf_utils.TFSampleEncoderDecoderOutput`], + - [`~generation_tf_utils.TFBeamSearchEncoderDecoderOutput`], + - [`~generation_tf_utils.TFBeamSampleEncoderDecoderOutput`] Examples: @@ -547,6 +555,38 @@ def generate( input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids ) # generate sequences without allowing bad_words to be generated ```""" + num_beams = num_beams if num_beams is not None else self.config.num_beams + do_sample = do_sample if do_sample is not None else self.config.do_sample + + is_greedy_gen_mode = num_beams == 1 and do_sample is False + + if is_greedy_gen_mode: + return self._generate( + input_ids=input_ids, + max_length=max_length, + min_length=min_length, + do_sample=do_sample, + early_stopping=early_stopping, + num_beams=num_beams, + temperature=temperature, + top_k=top_k, + top_p=top_p, + repetition_penalty=repetition_penalty, + bad_words_ids=bad_words_ids, + bos_token_id=bos_token_id, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + length_penalty=length_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + num_return_sequences=num_return_sequences, + attention_mask=attention_mask, + decoder_start_token_id=decoder_start_token_id, + use_cache=use_cache, + output_scores=output_scores, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict_in_generate=return_dict_in_generate, + ) # We cannot generate if the model does not have a LM head if self.get_output_embeddings() is None: @@ -557,12 +597,11 @@ def generate( max_length = max_length if max_length is not None else self.config.max_length min_length = min_length if min_length is not None else self.config.min_length - do_sample = do_sample if do_sample is not None else self.config.do_sample early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping - num_beams = num_beams if num_beams is not None else self.config.num_beams temperature = temperature if temperature is not None else self.config.temperature top_k = top_k if top_k is not None else self.config.top_k top_p = top_p if top_p is not None else self.config.top_p + repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id @@ -632,7 +671,7 @@ def generate( bad_words_ids is None or isinstance(bad_words_ids, list) and isinstance(bad_words_ids[0], list) ), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated" - # This block corresponds to the following line in `generation_utils`: + # This block corresponds to the following line in `generation_tf_utils`: # "input_ids = self._prepare_input_ids_for_generation(bos_token_id, model_kwargs.get("encoder_outputs"))" # with the following differences: # 1. In PT, `generate()`'s `model_kwargs` can accept `encoder_outputs`, but not the case in TF. @@ -751,14 +790,13 @@ def generate( cur_len < max_length ), f"The context has {cur_len} number of tokens, but `max_length` is only {max_length}. 
Please make sure that `max_length` is bigger than the number of tokens, by setting either `generate(max_length=...,...)` or `config.max_length = ...`" - if num_beams > 1: - output = self._generate_beam_search( + if num_beams == 1: + return self._generate_no_beam_search( input_ids, cur_len=cur_len, max_length=max_length, min_length=min_length, do_sample=do_sample, - early_stopping=early_stopping, temperature=temperature, top_k=top_k, top_p=top_p, @@ -768,25 +806,21 @@ def generate( pad_token_id=pad_token_id, eos_token_id=eos_token_id, batch_size=effective_batch_size, - num_return_sequences=num_return_sequences, - length_penalty=length_penalty, - num_beams=num_beams, vocab_size=vocab_size, encoder_outputs=encoder_outputs, attention_mask=attention_mask, use_cache=use_cache, - forced_bos_token_id=forced_bos_token_id, - forced_eos_token_id=forced_eos_token_id, return_dict_in_generate=return_dict_in_generate, **model_kwargs, ) else: - output = self._generate_no_beam_search( + return self._generate_beam_search( input_ids, cur_len=cur_len, max_length=max_length, min_length=min_length, do_sample=do_sample, + early_stopping=early_stopping, temperature=temperature, top_k=top_k, top_p=top_p, @@ -796,16 +830,19 @@ def generate( pad_token_id=pad_token_id, eos_token_id=eos_token_id, batch_size=effective_batch_size, + num_return_sequences=num_return_sequences, + length_penalty=length_penalty, + num_beams=num_beams, vocab_size=vocab_size, encoder_outputs=encoder_outputs, attention_mask=attention_mask, use_cache=use_cache, + forced_bos_token_id=forced_bos_token_id, + forced_eos_token_id=forced_eos_token_id, return_dict_in_generate=return_dict_in_generate, **model_kwargs, ) - return output - def _generate_no_beam_search( self, input_ids, @@ -1488,6 +1525,676 @@ def adjust_logits_during_generation( else: return logits + def _generate( + self, + input_ids=None, + max_length=None, + min_length=None, + do_sample=None, + early_stopping=None, + num_beams=None, + temperature=None, + top_k=None, + top_p=None, + repetition_penalty=None, + bad_words_ids=None, + bos_token_id=None, + pad_token_id=None, + eos_token_id=None, + length_penalty=None, + no_repeat_ngram_size=None, + num_return_sequences=None, + attention_mask=None, + decoder_start_token_id=None, + use_cache=None, + output_scores=None, + output_attentions=None, + output_hidden_states=None, + return_dict_in_generate=None, + forced_bos_token_id=None, + forced_eos_token_id=None, + **model_kwargs, + ) -> Union[TFGreedySearchOutput, TFSampleOutput, TFBeamSearchOutput, TFBeamSampleOutput, tf.Tensor]: + r""" + Generates sequences for models with a language modeling head. The method currently supports greedy decoding, + beam-search decoding, sampling with temperature, sampling with top-k or nucleus sampling. + + Adapted in part from [Facebook's XLM beam search + code](https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529). + + Apart from `input_ids` and `attention_mask`, all the arguments below will default to the value of the attribute + of the same name inside the [`PretrainedConfig`] of the model. The default values indicated are the default + values of those config. + + Most of these parameters are explained in more detail in [this blog + post](https://huggingface.co/blog/how-to-generate). + + Parameters: + + input_ids (`tf.Tensor` of `dtype=tf.int32` and shape `(batch_size, sequence_length)`, *optional*): + The sequence used as a prompt for the generation. 
If `None` the method initializes it with + `bos_token_id` and a batch size of 1. + max_length (`int`, *optional*, defaults to 20): + The maximum length of the sequence to be generated. + min_length (`int`, *optional*, defaults to 10): + The minimum length of the sequence to be generated. + do_sample (`bool`, *optional*, defaults to `False`): + Whether or not to use sampling ; use greedy decoding otherwise. + early_stopping (`bool`, *optional*, defaults to `False`): + Whether to stop the beam search when at least `num_beams` sentences are finished per batch or not. + num_beams (`int`, *optional*, defaults to 1): + Number of beams for beam search. 1 means no beam search. + temperature (`float`, *optional*, defaults to 1.0): + The value used to module the next token probabilities. + top_k (`int`, *optional*, defaults to 50): + The number of highest probability vocabulary tokens to keep for top-k-filtering. + top_p (`float`, *optional*, defaults to 1.0): + If set to float < 1, only the most probable tokens with probabilities that add up to `top_p` or higher + are kept for generation. + repetition_penalty (`float`, *optional*, defaults to 1.0): + The parameter for repetition penalty. 1.0 means no penalty. See [this + paper](https://arxiv.org/pdf/1909.05858.pdf) for more details. + pad_token_id (`int`, *optional*): + The id of the *padding* token. + bos_token_id (`int`, *optional*): + The id of the *beginning-of-sequence* token. + eos_token_id (`int`, *optional*): + The id of the *end-of-sequence* token. + length_penalty (`float`, *optional*, defaults to 1.0): + Exponential penalty to the length. 1.0 means no penalty. + + Set to values < 1.0 in order to encourage the model to generate shorter sequences, to a value > 1.0 in + order to encourage the model to produce longer sequences. + no_repeat_ngram_size (`int`, *optional*, defaults to 0): + If set to int > 0, all ngrams of that size can only occur once. + bad_words_ids(`List[int]`, *optional*): + List of token ids that are not allowed to be generated. In order to get the tokens of the words that + should not appear in the generated text, use `tokenizer.encode(bad_word, add_prefix_space=True)`. + num_return_sequences(`int`, *optional*, defaults to 1): + The number of independently computed returned sequences for each element in the batch. + attention_mask (`tf.Tensor` of `dtype=tf.int32` and shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values are in `[0, 1]`, 1 for tokens + that are not masked, and 0 for masked tokens. + + If not provided, will default to a tensor the same shape as `input_ids` that masks the pad token. + + [What are attention masks?](../glossary#attention-mask) + decoder_start_token_id (`int`, *optional*): + If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should use the past last key/values attentions (if applicable to the model) to + speed up decoding. + output_attentions (`bool`, *optional*, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more details. + output_hidden_states (`bool`, *optional*, defaults to `False`): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more details. 
+ output_scores (`bool`, *optional*, defaults to `False`): + Whether or not to return the prediction scores. See `scores` under returned tensors for more details. + return_dict_in_generate (`bool`, *optional*, defaults to `False`): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. + forced_bos_token_id (`int`, *optional*): + The id of the token to force as the first generated token after the `decoder_start_token_id`. Useful + for multilingual models like [mBART](../model_doc/mbart) where the first generated token needs to be + the target language token. + forced_eos_token_id (`int`, *optional*): + The id of the token to force as the last generated token when `max_length` is reached. + model_specific_kwargs: + Additional model specific kwargs will be forwarded to the `forward` function of the model. + + Return: + [`~file_utils.ModelOutput`] or `tf.Tensor`: A [`~file_utils.ModelOutput`] (if + `return_dict_in_generate=True` or when `config.return_dict_in_generate=True`) or a `tf.Tensor`. + + If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible + [`~file_utils.ModelOutput`] types are: + + - [`~generation_tf_utils.TFGreedySearchDecoderOnlyOutput`], + - [`~generation_tf_utils.TFSampleDecoderOnlyOutput`], + - [`~generation_tf_utils.TFBeamSearchDecoderOnlyOutput`], + - [`~generation_tf_utils.TFBeamSampleDecoderOnlyOutput`] + + If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible + [`~file_utils.ModelOutput`] types are: + + - [`~generation_tf_utils.TFGreedySearchEncoderDecoderOutput`], + - [`~generation_tf_utils.TFSampleEncoderDecoderOutput`], + - [`~generation_tf_utils.TFBeamSearchEncoderDecoderOutput`], + - [`~generation_tf_utils.TFBeamSampleEncoderDecoderOutput`] + + Examples: + + ```python + tokenizer = AutoTokenizer.from_pretrained("distilgpt2") # Initialize tokenizer + model = TFAutoModelWithLMHead.from_pretrained("distilgpt2") + # Greedy decoding + outputs = model.generate(max_length=40) + print(f"Generated: {tokenizer.decode(outputs[0], skip_special_tokens=True)}") + + tokenizer = AutoTokenizer.from_pretrained("openai-gpt") + model = TFAutoModelWithLMHead.from_pretrained("openai-gpt") + input_context = "The dog" + input_ids = tokenizer.encode(input_context, return_tensors="tf") # encode input context + # Generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog' + outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5) + # 3 output sequences were generated + for i in range(3): + print(f"Generated {i}: {tokenizer.decode(outputs[i], skip_special_tokens=True)}") + + tokenizer = AutoTokenizer.from_pretrained("distilgpt2") + model = TFAutoModelWithLMHead.from_pretrained("distilgpt2") + input_context = "The dog" + input_ids = tokenizer.encode(input_context, return_tensors="tf") + # Generate 3 candidates using sampling + outputs = model.generate( + input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3, do_sample=True + ) + # 3 output sequences were generated + for i in range(3): + print(f"Generated {i}: {tokenizer.decode(outputs[i], skip_special_tokens=True)}") + + tokenizer = AutoTokenizer.from_pretrained("ctrl") + model = TFAutoModelWithLMHead.from_pretrained("ctrl") + # "Legal" is one of the control codes for ctrl + input_context = "Legal My neighbor is" + input_ids = tokenizer.encode(input_context, return_tensors="tf") + outputs = 
model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2) + print(f"Generated: {tokenizer.decode(outputs[0], skip_special_tokens=True)}") + + tokenizer = AutoTokenizer.from_pretrained("gpt2") + model = TFAutoModelWithLMHead.from_pretrained("gpt2") + input_context = "My cute dog" + bad_words_ids = [ + tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ["idiot", "stupid", "shut up"] + ] + input_ids = tokenizer.encode(input_context, return_tensors="tf") + # generate sequences without allowing bad_words to be generated + outputs = model.generate(input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids) + ```""" + # 1. Set generation parameters if not already defined + max_length = max_length if max_length is not None else self.config.max_length + min_length = min_length if min_length is not None else self.config.min_length + early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping + + bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id + pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id + + output_scores = output_scores if output_scores is not None else self.config.output_scores + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict_in_generate = ( + return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate + ) + + num_beams = num_beams if num_beams is not None else self.config.num_beams + do_sample = do_sample if do_sample is not None else self.config.do_sample + num_return_sequences = ( + num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences + ) + + if pad_token_id is None and eos_token_id is not None: + logger.warning(f"Setting `pad_token_id` to {eos_token_id} (first `eos_token_id`) to generate sequence") + pad_token_id = eos_token_id + + # 2. Define model inputs + input_ids = self._prepare_model_inputs(input_ids, bos_token_id) + # inputs_ids now has to be defined and cannot be None anymore + batch_size = input_ids.shape[0] + + # 3. Prepare other model kwargs + model_kwargs["output_attentions"] = output_attentions + model_kwargs["output_hidden_states"] = output_hidden_states + model_kwargs["use_cache"] = use_cache + + requires_attention_mask = "encoder_outputs" not in model_kwargs + + if model_kwargs.get("attention_mask", None) is None and requires_attention_mask: + model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation(input_ids, pad_token_id) + + if self.config.is_encoder_decoder: + # if model is encoder decoder model, we create encoder_outputs and add to `model_kwargs` + model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation( + input_ids, return_dict_in_generate, model_kwargs + ) + + # TODO(Patrick) - ugly `past`/`encoder_output` hack here which requires a bigger + # refactor of all generation models in TF. `past` should be + # optional everywhere and not be set equal to encoder_outputs + model_kwargs["past"] = model_kwargs.get("encoder_outputs")[:1] if self.config.is_encoder_decoder else None + + # 4. 
Prepare `input_ids` which will be used for auto-regressive generation + if self.config.is_encoder_decoder: + # if encoder-decoder then `input_ids` come from `decoder_start_token_id` + input_ids = self._prepare_decoder_input_ids_for_generation( + batch_size, + decoder_start_token_id=decoder_start_token_id, + bos_token_id=bos_token_id, + model_kwargs=model_kwargs, + ) + + if input_ids.shape[-1] >= max_length: + raise ValueError( + f"The context has {input_ids.shape[-1]} number of tokens, " + f"but `max_length` is only {max_length}. " + "Please make sure that `max_length` is bigger than the number of tokens, " + "by setting either `generate(max_length=...,...)` or `config.max_length = ...`" + ) + + # 5. determine generation mode + # TODO(Matt, Joao, Patrick) - add more use cases here + is_greedy_gen_mode = (num_beams == 1) and do_sample is False + + # 6. prepare distribution pre_processing samplers + logits_processor = self._get_logits_processor( + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + bad_words_ids=bad_words_ids, + min_length=min_length, + eos_token_id=eos_token_id, + ) + + # 7. go into different generation modes + if is_greedy_gen_mode: + if num_return_sequences > 1: + raise ValueError( + f"num_return_sequences has to be 1, but is {num_return_sequences} when doing greedy search." + ) + + # 8. run greedy search + return self.greedy_search( + input_ids, + max_length=max_length, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + logits_processor=logits_processor, + output_scores=output_scores, + return_dict_in_generate=return_dict_in_generate, + **model_kwargs, + ) + + # TODO(Matt, Joao, Patrick) - add more sub-generation methods here + + def _prepare_attention_mask_for_generation( + self, + input_ids: tf.Tensor, + pad_token_id: int, + ) -> tf.Tensor: + # prepare `attention_mask` if not passed + if (pad_token_id is not None) and (pad_token_id in input_ids.numpy()): + return tf.cast(tf.math.not_equal(input_ids, pad_token_id), dtype=tf.int32) + else: + return tf.ones(input_ids.shape[:2], dtype=tf.int32) + + def _prepare_encoder_decoder_kwargs_for_generation( + self, input_ids: tf.Tensor, return_dict_in_generate, model_kwargs + ) -> Dict[str, Any]: + # TODO(Patrick) - remove `return_dict_in_generate` flag input once `past`/`encoder_outputs` + # is cleaned + + # get encoder and store encoder outputs + encoder = self.get_encoder() + + # prepare encoder args and encoder kwargs from model kwargs + irrelevant_prefix = ["decoder_", "cross_attn", "use_cache"] + encoder_kwargs = { + argument: value + for argument, value in model_kwargs.items() + if not any(argument.startswith(p) for p in irrelevant_prefix) + } + + # vision models don't use `attention_mask`. + signature = dict(inspect.signature(encoder.call).parameters) + if "attention_mask" not in signature: + encoder_kwargs.pop("attention_mask") + + encoder_outputs = encoder(input_ids, **encoder_kwargs) + + model_kwargs["encoder_outputs"] = encoder_outputs + + # TODO(Patrick): `encoder_outputs`, `past` hack. Currently, `encoder_attentions` and + # `encoder_hidden_states` have to be seperated from encoder_outputs and passed + # under other names because of `encoder_outputs`, `past` hack. 
Need to clean-up + # all encoder-decoder prepare_inputs_for_generation method to clean this + if return_dict_in_generate: + model_kwargs["encoder_attentions"] = encoder_outputs.get("attentions", None) + model_kwargs["encoder_hidden_states"] = encoder_outputs.get("hidden_states", None) + + return model_kwargs + + def _prepare_decoder_input_ids_for_generation( + self, + batch_size: int, + decoder_start_token_id: int = None, + bos_token_id: int = None, + model_kwargs: Optional[Dict[str, tf.Tensor]] = None, + ) -> tf.Tensor: + + # prepare `input_ids` for decoder if model is encoder-decoder + if model_kwargs is not None and "decoder_input_ids" in model_kwargs: + return model_kwargs.pop("decoder_input_ids") + else: + decoder_start_token_id = self._get_decoder_start_token_id(decoder_start_token_id, bos_token_id) + return tf.ones((batch_size, 1), dtype=tf.int32) * decoder_start_token_id + + def _get_decoder_start_token_id(self, decoder_start_token_id: int = None, bos_token_id: int = None) -> int: + # retrieve decoder_start_token_id for encoder-decoder models + # fall back to bos_token_id if necessary + decoder_start_token_id = ( + decoder_start_token_id if decoder_start_token_id is not None else self.config.decoder_start_token_id + ) + bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id + + if decoder_start_token_id is not None: + return decoder_start_token_id + elif ( + hasattr(self.config, "decoder") + and hasattr(self.config.decoder, "decoder_start_token_id") + and self.config.decoder.decoder_start_token_id is not None + ): + return self.config.decoder.decoder_start_token_id + elif bos_token_id is not None: + return bos_token_id + elif ( + hasattr(self.config, "decoder") + and hasattr(self.config.decoder, "bos_token_id") + and self.config.decoder.bos_token_id is not None + ): + return self.config.decoder.bos_token_id + raise ValueError( + "`decoder_start_token_id` or `bos_token_id` has to be defined for encoder-decoder generation." + ) + + def _prepare_model_inputs(self, inputs: Optional[tf.Tensor] = None, bos_token_id: Optional[int] = None): + # TODO(Patrick) - adapt this function when making `generate` more flexible + # for all kinds of input types + if inputs is None: + # if no `inputs` are passed create prompt of size (1,1) filled with BOS token + if not isinstance(bos_token_id, int) or bos_token_id < 0: + raise ValueError( + "you should either supply a context to complete as `input_ids` input " + "or a `bos_token_id` (integer >= 0) as a first token to start the generation." + ) + return tf.cast(tf.fill((1, 1), bos_token_id), dtype=tf.int32) + + return inputs + + def _update_model_kwargs_for_generation( + self, outputs: ModelOutput, model_kwargs: Dict[str, Any], is_encoder_decoder: bool = False + ) -> Dict[str, Any]: + # update past + if self._use_cache(outputs, model_kwargs["use_cache"]): + # TODO(Patrick): `past`/`encoder_outputs` hack. This should be + # removed when cleaning up the encoder-decoder models + # if model has past, then set the past variable to speed up decoding + # make this method static then as well + model_kwargs["past"] = outputs[1] + elif "past_key_values" in outputs: + model_kwargs["past"] = outputs.past_key_values + elif "mems" in outputs: + model_kwargs["past"] = outputs.mems + elif "past_buckets_states" in outputs: + model_kwargs["past"] = outputs.past_buckets_states + elif "past" in model_kwargs: + # TODO(Patrick) `past`/`encoder_outputs` hack. + # removed when cleaning up the encoder-decoder models. 
+ # The line should not be necessary. + pass + else: + model_kwargs["past"] = None + + # update attention mask + if not is_encoder_decoder: + if "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + model_kwargs["attention_mask"] = tf.concat( + [attention_mask, tf.ones((shape_list(attention_mask)[0], 1), dtype=tf.int32)], axis=-1 + ) + + return model_kwargs + + def _get_logits_processor( + self, + repetition_penalty: float, + no_repeat_ngram_size: int, + bad_words_ids: List[List[int]], + min_length: int, + eos_token_id: int, + ) -> TFLogitsProcessorList: + """ + This class returns a [`TFLogitsProcessorList`] list object that contains all relevant [`TFLogitsProcessor`] + instances used to modify the scores of the language model head. + """ + processors = TFLogitsProcessorList() + + repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty + no_repeat_ngram_size = ( + no_repeat_ngram_size if no_repeat_ngram_size is not None else self.config.no_repeat_ngram_size + ) + bad_words_ids = bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids + eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id + + # instantiate processors list + if repetition_penalty is not None and repetition_penalty != 1.0: + processors.append(TFRepetitionPenaltyLogitsProcessor(penalty=repetition_penalty)) + if no_repeat_ngram_size is not None and no_repeat_ngram_size > 0: + processors.append(TFNoRepeatNGramLogitsProcessor(no_repeat_ngram_size)) + if bad_words_ids is not None: + processors.append(TFNoBadWordsLogitsProcessor(bad_words_ids, eos_token_id)) + if min_length is not None and eos_token_id is not None and min_length > -1: + processors.append(TFMinLengthLogitsProcessor(min_length, eos_token_id)) + + return processors + + def greedy_search( + self, + input_ids: tf.Tensor, + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[int] = None, + logits_processor: Optional[TFLogitsProcessorList] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + **model_kwargs, + ) -> Union[TFGreedySearchOutput, tf.Tensor]: + r""" + Generates sequences for models with a language modeling head using greedy decoding. + + Parameters: + + input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. + logits_processor (`TFLogitsProcessorList`, *optional*): + An instance of [`TFLogitsProcessorList`]. List of instances of class derived from [`TFLogitsProcessor`] + used to modify the prediction scores of the language modeling head applied at each generation step. + max_length (`int`, *optional*, defaults to 20): + The maximum length of the sequence to be generated. + pad_token_id (`int`, *optional*): + The id of the *padding* token. + eos_token_id (`int`, *optional*): + The id of the *end-of-sequence* token. + output_attentions (`bool`, *optional*, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more details. + output_hidden_states (`bool`, *optional*, defaults to `False`): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more details. 
+ output_scores (`bool`, *optional*, defaults to `False`): + Whether or not to return the prediction scores. See `scores` under returned tensors for more details. + return_dict_in_generate (`bool`, *optional*, defaults to `False`): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. + model_kwargs: + Additional model specific keyword arguments will be forwarded to the `forward` function of the model. + If model is an encoder-decoder model the kwargs should include `encoder_outputs`. + + Return: + [`~generation_tf_utils.TFGreedySearchDecoderOnlyOutput`], + [`~generation_tf_utils.TFGreedySearchEncoderDecoderOutput`] or `tf.Tensor`: A `tf.Tensor` containing the + generated tokens (default behaviour) or a [`~generation_tf_utils.TFGreedySearchDecoderOnlyOutput`] if + `model.config.is_encoder_decoder=False` and `return_dict_in_generate=True` or a + [`~generation_tf_utils.TFGreedySearchEncoderDecoderOutput`] if `model.config.is_encoder_decoder=True`. + + Examples: + + ```python + >>> from transformers import ( + ... AutoTokenizer, + ... TFAutoModelForCausalLM, + ... TFLogitsProcessorList, + ... TFMinLengthLogitsProcessor, + ... ) + + >>> tokenizer = AutoTokenizer.from_pretrained("gpt2") + >>> model = TFAutoModelForCausalLM.from_pretrained("gpt2") + + >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token + >>> model.config.pad_token_id = model.config.eos_token_id + + >>> input_prompt = "Today is a beautiful day, and" + >>> input_ids = tokenizer(input_prompt, return_tensors="tf").input_ids + + >>> # instantiate logits processors + >>> logits_processor = TFLogitsProcessorList( + ... [ + ... TFMinLengthLogitsProcessor(15, eos_token_id=model.config.eos_token_id), + ... ] + ... ) + + >>> outputs = model.greedy_search(input_ids, logits_processor=logits_processor) + + >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True)) + ```""" + # init values + logits_processor = logits_processor if logits_processor is not None else TFLogitsProcessorList() + + pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id + output_scores = output_scores if output_scores is not None else self.config.output_scores + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict_in_generate = ( + return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate + ) + + # init attention / hidden states / scores tuples + scores = () if (return_dict_in_generate and output_scores) else None + decoder_attentions = () if (return_dict_in_generate and output_attentions) else None + cross_attentions = () if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None + + # TODO(Patrick): `encoder_outputs`, `past` hack. Currently T5, Bart expect `encoder_outputs` + # to be wrapped into `past` variable. This is a bad design and needs + # to be updated.
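The greedy-search loop in the hunk that follows keeps an integer 0/1 `unfinished_sequences` mask and uses a small piece of arithmetic to force rows that already produced `eos_token_id` to keep emitting the padding token. A minimal standalone sketch of that masking step (the tensor values are made up for illustration):

```python
import tensorflow as tf

pad_token_id = 0
# 1 = still generating, 0 = already finished (hit EOS on an earlier step).
unfinished_sequences = tf.constant([1, 0], dtype=tf.int32)
# Tokens picked by the argmax for the current step.
next_tokens = tf.constant([42, 17], dtype=tf.int32)

# Finished rows are overwritten with the pad token, unfinished rows keep their argmax pick.
next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
print(next_tokens.numpy())  # [42  0]
```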
+ # Remove the following lines when updating all encoder-decoder models + encoder_outputs = model_kwargs.pop("encoder_outputs", None) + + # if model is an encoder-decoder, retrieve encoder attention weights and hidden states + if return_dict_in_generate and self.config.is_encoder_decoder: + encoder_attentions = encoder_outputs.get("attentions") if output_attentions else None + encoder_hidden_states = encoder_outputs.get("hidden_states") if output_hidden_states else None + + # keep track of which sequences are already finished + unfinished_sequences = tf.ones_like(input_ids[:, 0]) + cur_len = input_ids.shape[-1] + + while cur_len < max_length: + # TODO(Patrick): remove following line by cleaning up `prepare_inputs_for_generation` + # in all models + model_kwargs["use_cache"] = None if "use_cache" not in model_kwargs else model_kwargs["use_cache"] + + # prepare model inputs + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + + # forward pass to get next token + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + next_token_logits = outputs.logits[:, -1, :] + + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if output_scores: + scores += (next_token_logits,) + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) + if self.config.is_encoder_decoder + else (outputs.hidden_states,) + ) + + # pre-process distribution + next_tokens_scores = logits_processor(input_ids, next_token_logits) + + # argmax + next_tokens = tf.cast(tf.argmax(next_tokens_scores, axis=-1), tf.int32) + + # finished sentences should have their next token be a padding token + if eos_token_id is not None: + if pad_token_id is None: + raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.") + next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) + + # update generated ids, model inputs, and length for next step + input_ids = tf.concat([input_ids, next_tokens[:, None]], axis=-1) + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + ) + cur_len = cur_len + 1 + + # if eos_token was found in one sentence, set sentence to finished + if eos_token_id is not None: + eos_in_sents = next_tokens == eos_token_id + # if sentence is unfinished and the token to add is eos + is_sents_unfinished_and_token_to_add_is_eos = tf.math.multiply( + unfinished_sequences, tf.cast(eos_in_sents, tf.int32) + ) + + # unfinished_sequences is set to zero if eos in sentence + unfinished_sequences -= is_sents_unfinished_and_token_to_add_is_eos + + # stop when each sentence is finished, or if we exceed the maximum length + if tf.math.reduce_max(unfinished_sequences) == 0: + break + + if return_dict_in_generate: + if self.config.is_encoder_decoder: + return TFGreedySearchEncoderDecoderOutput( + sequences=input_ids, + scores=scores, + encoder_attentions=encoder_attentions, + encoder_hidden_states=encoder_hidden_states, + decoder_attentions=decoder_attentions, + cross_attentions=cross_attentions, + decoder_hidden_states=decoder_hidden_states, + ) + else: + return 
TFGreedySearchDecoderOnlyOutput( + sequences=input_ids, + scores=scores, + attentions=decoder_attentions, + hidden_states=decoder_hidden_states, + ) + else: + return input_ids + def _create_next_token_logits_penalties(input_ids, logits, repetition_penalty): # create logit penalties for already seen input_ids @@ -1628,12 +2335,6 @@ def scatter_values_on_batch_indices(values, batch_indices): return tf.scatter_nd(pair_indices, tf.reshape(values, [-1]), shape) -def set_tensor_by_indices_to_value(tensor, indices, value): - # create value_tensor since tensor value assignment is not possible in TF - value_tensor = tf.zeros_like(tensor) + value - return tf.where(indices, value_tensor, tensor) - - def sample_without_replacement(logits, num_samples): """ categorical sampling without replacement is currently not implemented the gumbel-max trick will do for now see @@ -1644,13 +2345,6 @@ def sample_without_replacement(logits, num_samples): return indices -def shape_list(x): - """Deal with dynamic shape in tensorflow cleanly.""" - static = x.shape.as_list() - dynamic = tf.shape(x) - return [dynamic[i] if s is None else s for i, s in enumerate(static)] - - class BeamHypotheses(object): def __init__(self, num_beams, max_length, length_penalty, early_stopping): """ diff --git a/src/transformers/generation_utils.py b/src/transformers/generation_utils.py index 29fdf4b216d9e6..6700663afff5e1 100644 --- a/src/transformers/generation_utils.py +++ b/src/transformers/generation_utils.py @@ -901,8 +901,8 @@ def generate( If set to int > 0, all ngrams of that size that occur in the `encoder_input_ids` cannot occur in the `decoder_input_ids`. bad_words_ids(`List[List[int]]`, *optional*): - List of token ids that are not allowed to be generated. In order to get the tokens of the words that - should not appear in the generated text, use `tokenizer(bad_word, add_prefix_space=True, + List of token ids that are not allowed to be generated. In order to get the token ids of the words that + should not appear in the generated text, use `tokenizer(bad_words, add_prefix_space=True, add_special_tokens=False).input_ids`. num_return_sequences(`int`, *optional*, defaults to 1): The number of independently computed returned sequences for each element in the batch. diff --git a/src/transformers/modeling_flax_utils.py b/src/transformers/modeling_flax_utils.py index 969fd6daf22c5d..1d7c82421043de 100644 --- a/src/transformers/modeling_flax_utils.py +++ b/src/transformers/modeling_flax_utils.py @@ -492,23 +492,20 @@ def from_pretrained( user_agent=user_agent, ) - except RepositoryNotFoundError as err: - logger.error(err) + except RepositoryNotFoundError: raise EnvironmentError( f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier " "listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a " "token having permission to this repo with `use_auth_token` or log in with `huggingface-cli " "login` and pass `use_auth_token=True`." ) - except RevisionNotFoundError as err: - logger.error(err) + except RevisionNotFoundError: raise EnvironmentError( f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for " "this model name. Check the model page at " f"'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions." 
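The `generation_utils.py` docstring tweak above spells out the recipe for building `bad_words_ids`. A hedged illustration of that recipe (the banned words and the `gpt2` checkpoint are only an example; the slow `GPT2Tokenizer` is used because it accepts `add_prefix_space` as a call-time argument):

```python
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

bad_words = ["offensive", "rude"]
# One list of token ids per banned word, in the format expected by `generate(bad_words_ids=...)`.
bad_words_ids = tokenizer(bad_words, add_prefix_space=True, add_special_tokens=False).input_ids
print(bad_words_ids)  # a list of id lists, one per banned word
```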
) - except EntryNotFoundError as err: - logger.error(err) + except EntryNotFoundError: if filename == FLAX_WEIGHTS_NAME: has_file_kwargs = {"revision": revision, "proxies": proxies, "use_auth_token": use_auth_token} if has_file(pretrained_model_name_or_path, WEIGHTS_NAME, **has_file_kwargs): @@ -518,7 +515,6 @@ def from_pretrained( "those weights." ) else: - logger.error(err) raise EnvironmentError( f"{pretrained_model_name_or_path} does not appear to have a file named {FLAX_WEIGHTS_NAME} " f"or {WEIGHTS_NAME}." @@ -527,8 +523,7 @@ def from_pretrained( raise EnvironmentError( f"{pretrained_model_name_or_path} does not appear to have a file named {filename}." ) - except HTTPError as err: - logger.error(err) + except HTTPError: raise EnvironmentError( "We couldn't connect to 'https://huggingface.co/' to load this model and it looks like " f"{pretrained_model_name_or_path} is not the path to a directory conaining a a file named " @@ -536,8 +531,7 @@ def from_pretrained( "Checkout your internet connection or see how to run the library in offline mode at " "'https://huggingface.co/docs/transformers/installation#offline-mode'." ) - except EnvironmentError as err: - logger.error(err) + except EnvironmentError: raise EnvironmentError( f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it from " "'https://huggingface.co/models', make sure you don't have a local directory with the same name. " diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index 5e37a1818df853..de2c61ae4c016a 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -34,6 +34,7 @@ from huggingface_hub import Repository, list_repo_files from requests import HTTPError +from .activations_tf import get_tf_activation from .configuration_utils import PretrainedConfig from .dynamic_module_utils import custom_object_save from .file_utils import ( @@ -54,6 +55,7 @@ ) from .generation_tf_utils import TFGenerationMixin from .modeling_tf_outputs import TFSeq2SeqLMOutput +from .tf_utils import shape_list from .tokenization_utils_base import BatchEncoding from .utils import logging @@ -1135,6 +1137,11 @@ def resize_token_embeddings(self, new_num_tokens=None) -> tf.Variable: return model_embeds def _get_word_embedding_weight(model, embedding_layer): + # If the variable holds the weights themselves, return them + if isinstance(embedding_layer, tf.Tensor): + return embedding_layer + # Otherwise, try to get them from the layer's attributes + embeds = getattr(embedding_layer, "weight", None) if embeds is not None: return embeds @@ -1583,23 +1590,20 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): user_agent=user_agent, ) - except RepositoryNotFoundError as err: - logger.error(err) + except RepositoryNotFoundError: raise EnvironmentError( f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier " "listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a " "token having permission to this repo with `use_auth_token` or log in with `huggingface-cli " "login` and pass `use_auth_token=True`." ) - except RevisionNotFoundError as err: - logger.error(err) + except RevisionNotFoundError: raise EnvironmentError( f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for " "this model name. Check the model page at " f"'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions." 
) - except EntryNotFoundError as err: - logger.error(err) + except EntryNotFoundError: if filename == TF2_WEIGHTS_NAME: has_file_kwargs = { "revision": revision, @@ -1614,7 +1618,6 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): "those weights." ) else: - logger.error(err) raise EnvironmentError( f"{pretrained_model_name_or_path} does not appear to have a file named {TF2_WEIGHTS_NAME} " f"or {WEIGHTS_NAME}." @@ -1623,8 +1626,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): raise EnvironmentError( f"{pretrained_model_name_or_path} does not appear to have a file named {filename}." ) - except HTTPError as err: - logger.error(err) + except HTTPError: raise EnvironmentError( "We couldn't connect to 'https://huggingface.co/' to load this model and it looks like " f"{pretrained_model_name_or_path} is not the path to a directory conaining a a file named " @@ -1632,8 +1634,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): "Checkout your internet connection or see how to run the library in offline mode at " "'https://huggingface.co/docs/transformers/installation#offline-mode'." ) - except EnvironmentError as err: - logger.error(err) + except EnvironmentError: raise EnvironmentError( f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it from " "'https://huggingface.co/models', make sure you don't have a local directory with the same name. " @@ -1952,9 +1953,11 @@ def __init__(self, config: PretrainedConfig, initializer_range: float = 0.02, ** num_classes, kernel_initializer=get_initializer(initializer_range), name="summary" ) - self.has_activation = hasattr(config, "summary_activation") and config.summary_activation == "tanh" - if self.has_activation: - self.activation = tf.keras.activations.tanh + self.has_activation = False + activation_string = getattr(config, "summary_activation", None) + if activation_string is not None: + self.has_activation = True + self.activation = get_tf_activation(activation_string) self.has_first_dropout = hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0 if self.has_first_dropout: @@ -2042,29 +2045,6 @@ def register_for_auto_class(cls, auto_class="TFAutoModel"): cls._auto_class = auto_class -def shape_list(tensor: Union[tf.Tensor, np.ndarray]) -> List[int]: - """ - Deal with dynamic shape in tensorflow cleanly. - - Args: - tensor (`tf.Tensor` or `np.ndarray`): The tensor we want the shape of. - - Returns: - `List[int]`: The shape of the tensor as a list. - """ - if isinstance(tensor, np.ndarray): - return list(tensor.shape) - - dynamic = tf.shape(tensor) - - if tensor.shape == tf.TensorShape(None): - return dynamic - - static = tensor.shape.as_list() - - return [dynamic[i] if s is None else s for i, s in enumerate(static)] - - def get_initializer(initializer_range: float = 0.02) -> tf.initializers.TruncatedNormal: """ Creates a `tf.initializers.TruncatedNormal` with the given range. 
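The `shape_list` helper removed from `modeling_tf_utils.py` above now lives in `tf_utils` (see the new import at the top of the hunk). Its point is to mix static and dynamic dimensions, which matters inside a `tf.function` where some axes are unknown at trace time. A small sketch, assuming the helper is importable from its new location:

```python
import tensorflow as tf
from transformers.tf_utils import shape_list  # new home of the helper after this change


@tf.function(input_signature=[tf.TensorSpec(shape=[None, 8], dtype=tf.float32)])
def flatten(x):
    # Static dims come back as Python ints, unknown dims as scalar tensors,
    # so the result can be used directly for a reshape without retracing.
    batch, features = shape_list(x)
    return tf.reshape(x, (batch * features,))


print(flatten(tf.zeros((3, 8))).shape)  # (24,)
```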
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index bc93e0616bb460..10a313065b4b4e 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1366,23 +1366,20 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P user_agent=user_agent, ) - except RepositoryNotFoundError as err: - logger.error(err) + except RepositoryNotFoundError: raise EnvironmentError( f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier " "listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a " "token having permission to this repo with `use_auth_token` or log in with `huggingface-cli " "login` and pass `use_auth_token=True`." ) - except RevisionNotFoundError as err: - logger.error(err) + except RevisionNotFoundError: raise EnvironmentError( f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for " "this model name. Check the model page at " f"'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions." ) - except EntryNotFoundError as err: - logger.error(err) + except EntryNotFoundError: if filename == WEIGHTS_NAME: has_file_kwargs = { "revision": revision, @@ -1403,7 +1400,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P "weights." ) else: - logger.error(err) raise EnvironmentError( f"{pretrained_model_name_or_path} does not appear to have a file named {WEIGHTS_NAME}, " f"{TF2_WEIGHTS_NAME}, {TF_WEIGHTS_NAME} or {FLAX_WEIGHTS_NAME}." @@ -1412,8 +1408,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P raise EnvironmentError( f"{pretrained_model_name_or_path} does not appear to have a file named {filename}." ) - except HTTPError as err: - logger.error(err) + except HTTPError: raise EnvironmentError( "We couldn't connect to 'https://huggingface.co/' to load this model and it looks like " f"{pretrained_model_name_or_path} is not the path to a directory conaining a a file named " @@ -1421,8 +1416,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P "Checkout your internet connection or see how to run the library in offline mode at " "'https://huggingface.co/docs/transformers/installation#offline-mode'." ) - except EnvironmentError as err: - logger.error(err) + except EnvironmentError: raise EnvironmentError( f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it from " "'https://huggingface.co/models', make sure you don't have a local directory with the same name. 
" diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 6cdaac155d47b6..854e5e21bb5716 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -84,6 +84,7 @@ pegasus, perceiver, phobert, + poolformer, prophetnet, qdqbert, rag, diff --git a/src/transformers/models/albert/modeling_tf_albert.py b/src/transformers/models/albert/modeling_tf_albert.py index f2659e817a958c..42f1e5b34dff5f 100644 --- a/src/transformers/models/albert/modeling_tf_albert.py +++ b/src/transformers/models/albert/modeling_tf_albert.py @@ -51,8 +51,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_albert import AlbertConfig diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index eeffe624ca0edb..b274ebec4201e5 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -31,6 +31,7 @@ [ # Add configs here ("maskformer", "MaskFormerConfig"), + ("poolformer", "PoolFormerConfig"), ("convnext", "ConvNextConfig"), ("yoso", "YosoConfig"), ("swin", "SwinConfig"), @@ -127,6 +128,7 @@ [ # Add archive maps here ("maskformer", "MASK_FORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("poolformer", "POOLFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("convnext", "CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("yoso", "YOSO_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("swin", "SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -210,6 +212,7 @@ [ # Add full (and cased) model names here ("maskformer", "MaskFormer"), + ("poolformer", "PoolFormer"), ("convnext", "ConvNext"), ("yoso", "YOSO"), ("swin", "Swin"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index a146c611fb9633..ed0e58e0081a14 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -14,23 +14,28 @@ # limitations under the License. 
""" AutoFeatureExtractor class.""" import importlib +import json import os from collections import OrderedDict +from typing import Dict, Optional, Union # Build the list of all feature extractors from ...configuration_utils import PretrainedConfig +from ...dynamic_module_utils import get_class_from_dynamic_module from ...feature_extraction_utils import FeatureExtractionMixin -from ...file_utils import CONFIG_NAME, FEATURE_EXTRACTOR_NAME +from ...file_utils import CONFIG_NAME, FEATURE_EXTRACTOR_NAME, get_file_from_repo +from ...utils import logging from .auto_factory import _LazyAutoMapping from .configuration_auto import ( CONFIG_MAPPING_NAMES, AutoConfig, - config_class_to_model_type, model_type_to_module_name, replace_list_option_in_docstrings, ) +logger = logging.get_logger(__name__) + FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict( [ ("beit", "BeitFeatureExtractor"), @@ -48,6 +53,7 @@ ("vit_mae", "ViTFeatureExtractor"), ("segformer", "SegformerFeatureExtractor"), ("convnext", "ConvNextFeatureExtractor"), + ("poolformer", "PoolFormerFeatureExtractor"), ] ) @@ -63,9 +69,103 @@ def feature_extractor_class_from_name(class_name: str): return getattr(module, class_name) break + for config, extractor in FEATURE_EXTRACTOR_MAPPING._extra_content.items(): + if getattr(extractor, "__name__", None) == class_name: + return extractor + return None +def get_feature_extractor_config( + pretrained_model_name_or_path: Union[str, os.PathLike], + cache_dir: Optional[Union[str, os.PathLike]] = None, + force_download: bool = False, + resume_download: bool = False, + proxies: Optional[Dict[str, str]] = None, + use_auth_token: Optional[Union[bool, str]] = None, + revision: Optional[str] = None, + local_files_only: bool = False, + **kwargs, +): + """ + Loads the tokenizer configuration from a pretrained model tokenizer configuration. + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + This can be either: + + - a string, the *model id* of a pretrained model configuration hosted inside a model repo on + huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced + under a user or organization name, like `dbmdz/bert-base-german-cased`. + - a path to a *directory* containing a configuration file saved using the + [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`. + + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the standard + cache should not be used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force to (re-)download the configuration files and override the cached versions if they + exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. + use_auth_token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `transformers-cli login` (stored in `~/.huggingface`). + revision(`str`, *optional*, defaults to `"main"`): + The specific model version to use. 
It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + local_files_only (`bool`, *optional*, defaults to `False`): + If `True`, will only try to load the tokenizer configuration from local files. + + + + Passing `use_auth_token=True` is required when you want to use a private model. + + + + Returns: + `Dict`: The configuration of the tokenizer. + + Examples: + + ```python + # Download configuration from huggingface.co and cache. + tokenizer_config = get_tokenizer_config("bert-base-uncased") + # This model does not have a tokenizer config so the result will be an empty dict. + tokenizer_config = get_tokenizer_config("xlm-roberta-base") + + # Save a pretrained tokenizer locally and you can reload its config + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") + tokenizer.save_pretrained("tokenizer-test") + tokenizer_config = get_tokenizer_config("tokenizer-test") + ```""" + resolved_config_file = get_file_from_repo( + pretrained_model_name_or_path, + FEATURE_EXTRACTOR_NAME, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + use_auth_token=use_auth_token, + revision=revision, + local_files_only=local_files_only, + ) + if resolved_config_file is None: + logger.info( + "Could not locate the feature extractor configuration file, will try to use the model config instead." + ) + return {} + + with open(resolved_config_file, encoding="utf-8") as reader: + return json.load(reader) + + class AutoFeatureExtractor: r""" This is a generic feature extractor class that will be instantiated as one of the feature extractor classes of the @@ -128,6 +228,10 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): functions returns a `Tuple(feature_extractor, unused_kwargs)` where *unused_kwargs* is a dictionary consisting of the key/value pairs whose keys are not feature extractor attributes: i.e., the part of `kwargs` which has not been used to update `feature_extractor` and is otherwise ignored. + trust_remote_code (`bool`, *optional*, defaults to `False`): + Whether or not to allow for custom models defined on the Hub in their own modeling files. This option + should only be set to `True` for repositories you trust and in which you have read the code, as it will + execute code present on the Hub on your local machine. kwargs (`Dict[str, Any]`, *optional*): The values in kwargs of any keys which are feature extractor attributes will be used to override the loaded values. 
Behavior concerning key/value pairs whose keys are *not* feature extractor attributes is @@ -151,35 +255,66 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): >>> feature_extractor = AutoFeatureExtractor.from_pretrained("./test/saved_model/") ```""" config = kwargs.pop("config", None) + trust_remote_code = kwargs.pop("trust_remote_code", False) kwargs["_from_auto"] = True - is_feature_extraction_file = os.path.isfile(pretrained_model_name_or_path) - is_directory = os.path.isdir(pretrained_model_name_or_path) and os.path.exists( - os.path.join(pretrained_model_name_or_path, FEATURE_EXTRACTOR_NAME) - ) - - has_local_config = ( - os.path.exists(os.path.join(pretrained_model_name_or_path, CONFIG_NAME)) if is_directory else False - ) + config_dict, _ = FeatureExtractionMixin.get_feature_extractor_dict(pretrained_model_name_or_path, **kwargs) + feature_extractor_class = config_dict.get("feature_extractor_type", None) + feature_extractor_auto_map = None + if "AutoFeatureExtractor" in config_dict.get("auto_map", {}): + feature_extractor_auto_map = config_dict["auto_map"]["AutoFeatureExtractor"] - # load config, if it can be loaded - if not is_feature_extraction_file and (has_local_config or not is_directory): + # If we don't find the feature extractor class in the feature extractor config, let's try the model config. + if feature_extractor_class is None and feature_extractor_auto_map is None: if not isinstance(config, PretrainedConfig): config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + # It could be in `config.feature_extractor_type`` + feature_extractor_class = getattr(config, "feature_extractor_type", None) + if hasattr(config, "auto_map") and "AutoFeatureExtractor" in config.auto_map: + feature_extractor_auto_map = config.auto_map["AutoFeatureExtractor"] - kwargs["_from_auto"] = True - config_dict, _ = FeatureExtractionMixin.get_feature_extractor_dict(pretrained_model_name_or_path, **kwargs) + if feature_extractor_class is not None: + # If we have custom code for a feature extractor, we get the proper class. + if feature_extractor_auto_map is not None: + if not trust_remote_code: + raise ValueError( + f"Loading {pretrained_model_name_or_path} requires you to execute the feature extractor file " + "in that repo on your local machine. Make sure you have read the code there to avoid " + "malicious use, then set the option `trust_remote_code=True` to remove this error." + ) + if kwargs.get("revision", None) is None: + logger.warning( + "Explicitly passing a `revision` is encouraged when loading a feature extractor with custom " + "code to ensure no malicious code has been contributed in a newer revision." + ) - model_type = config_class_to_model_type(type(config).__name__) + module_file, class_name = feature_extractor_auto_map.split(".") + feature_extractor_class = get_class_from_dynamic_module( + pretrained_model_name_or_path, module_file + ".py", class_name, **kwargs + ) + else: + feature_extractor_class = feature_extractor_class_from_name(feature_extractor_class) - if "feature_extractor_type" in config_dict: - feature_extractor_class = feature_extractor_class_from_name(config_dict["feature_extractor_type"]) return feature_extractor_class.from_dict(config_dict, **kwargs) - elif model_type is not None: - return FEATURE_EXTRACTOR_MAPPING[type(config)].from_dict(config_dict, **kwargs) + # Last try: we use the FEATURE_EXTRACTOR_MAPPING. 
+ elif type(config) in FEATURE_EXTRACTOR_MAPPING: + feature_extractor_class = FEATURE_EXTRACTOR_MAPPING[type(config)] + return feature_extractor_class.from_dict(config_dict, **kwargs) raise ValueError( - f"Unrecognized feature extractor in {pretrained_model_name_or_path}. Should have a `feature_extractor_type` key in " - f"its {FEATURE_EXTRACTOR_NAME}, or one of the following `model_type` keys in its {CONFIG_NAME}: " - f"{', '.join(c for c in FEATURE_EXTRACTOR_MAPPING_NAMES.keys())}" + f"Unrecognized feature extractor in {pretrained_model_name_or_path}. Should have a " + f"`feature_extractor_type` key in its {FEATURE_EXTRACTOR_NAME} or {CONFIG_NAME}, or one of the following " + f"`model_type` keys in its {CONFIG_NAME}: {', '.join(c for c in FEATURE_EXTRACTOR_MAPPING_NAMES.keys())}" ) + + @staticmethod + def register(config_class, feature_extractor_class): + """ + Register a new feature extractor for this class. + + Args: + config_class ([`PretrainedConfig`]): + The configuration corresponding to the model to register. + feature_extractor_class ([`FeatureExtractionMixin`]): The feature extractor to register. + """ + FEATURE_EXTRACTOR_MAPPING.register(config_class, feature_extractor_class) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index ab2cc3c7e40e2b..b80387cd38aef6 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -29,6 +29,7 @@ [ # Base model mapping ("maskformer", "MaskFormerModel"), + ("poolformer", "PoolFormerModel"), ("convnext", "ConvNextModel"), ("yoso", "YosoModel"), ("swin", "SwinModel"), @@ -276,6 +277,7 @@ ), ("swin", "SwinForImageClassification"), ("convnext", "ConvNextForImageClassification"), + ("poolformer", "PoolFormerForImageClassification"), ] ) @@ -283,8 +285,7 @@ [ # Do not add new models here, this class will be deprecated in the future.
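The `register` hook added to `AutoFeatureExtractor` above lets downstream code wire a custom config/feature-extractor pair into the auto classes. A hedged sketch with placeholder classes (`MyConfig` and `MyFeatureExtractor` are invented for illustration, not part of the library):

```python
from transformers import AutoConfig, AutoFeatureExtractor, PretrainedConfig
from transformers.feature_extraction_utils import FeatureExtractionMixin


class MyConfig(PretrainedConfig):
    model_type = "my-model"


class MyFeatureExtractor(FeatureExtractionMixin):
    pass


# Register the pair so the auto classes can resolve it like any built-in model.
AutoConfig.register("my-model", MyConfig)
AutoFeatureExtractor.register(MyConfig, MyFeatureExtractor)
```

Once registered, the `_extra_content` lookup added to `feature_extractor_class_from_name` earlier in this diff can also resolve such a class by name.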
# Model for Image Segmentation mapping - ("detr", "DetrForSegmentation"), - ("maskformer", ("MaskFormerForSemanticSegmentation", "MaskFormerForPanopticSegmentation")), + ("detr", "DetrForSegmentation") ] ) diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index 5a788e16b8ac74..68b846da965bd7 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -20,19 +20,22 @@ # Build the list of all feature extractors from ...configuration_utils import PretrainedConfig +from ...dynamic_module_utils import get_class_from_dynamic_module from ...feature_extraction_utils import FeatureExtractionMixin from ...file_utils import CONFIG_NAME, FEATURE_EXTRACTOR_NAME, get_file_from_repo from ...tokenization_utils import TOKENIZER_CONFIG_FILE +from ...utils import logging from .auto_factory import _LazyAutoMapping from .configuration_auto import ( CONFIG_MAPPING_NAMES, AutoConfig, - config_class_to_model_type, model_type_to_module_name, replace_list_option_in_docstrings, ) +logger = logging.get_logger(__name__) + PROCESSOR_MAPPING_NAMES = OrderedDict( [ ("clip", "CLIPProcessor"), @@ -57,7 +60,10 @@ def processor_class_from_name(class_name: str): module = importlib.import_module(f".{module_name}", "transformers.models") return getattr(module, class_name) - break + + for processor in PROCESSOR_MAPPING._extra_content.values(): + if getattr(processor, "__name__", None) == class_name: + return processor return None @@ -120,6 +126,10 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): functions returns a `Tuple(feature_extractor, unused_kwargs)` where *unused_kwargs* is a dictionary consisting of the key/value pairs whose keys are not feature extractor attributes: i.e., the part of `kwargs` which has not been used to update `feature_extractor` and is otherwise ignored. + trust_remote_code (`bool`, *optional*, defaults to `False`): + Whether or not to allow for custom models defined on the Hub in their own modeling files. This option + should only be set to `True` for repositories you trust and in which you have read the code, as it will + execute code present on the Hub on your local machine. kwargs (`Dict[str, Any]`, *optional*): The values in kwargs of any keys which are feature extractor attributes will be used to override the loaded values. Behavior concerning key/value pairs whose keys are *not* feature extractor attributes is @@ -143,10 +153,14 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): >>> processor = AutoProcessor.from_pretrained("./test/saved_model/") ```""" config = kwargs.pop("config", None) + trust_remote_code = kwargs.pop("trust_remote_code", False) kwargs["_from_auto"] = True + processor_class = None + processor_auto_map = None + # First, let's see if we have a preprocessor config. - # Filter the kwargs for `get_file_from_repo``. + # Filter the kwargs for `get_file_from_repo`. 
get_file_from_repo_kwargs = { key: kwargs[key] for key in inspect.signature(get_file_from_repo).parameters.keys() if key in kwargs } @@ -156,35 +170,63 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): ) if preprocessor_config_file is not None: config_dict, _ = FeatureExtractionMixin.get_feature_extractor_dict(pretrained_model_name_or_path, **kwargs) - if "processor_class" in config_dict: - processor_class = processor_class_from_name(config_dict["processor_class"]) - return processor_class.from_pretrained(pretrained_model_name_or_path, **kwargs) - - # Next, let's check whether the processor class is saved in a tokenizer - # Let's start by checking whether the processor class is saved in a feature extractor - tokenizer_config_file = get_file_from_repo( - pretrained_model_name_or_path, TOKENIZER_CONFIG_FILE, **get_file_from_repo_kwargs - ) - if tokenizer_config_file is not None: - with open(tokenizer_config_file, encoding="utf-8") as reader: - config_dict = json.load(reader) - - if "processor_class" in config_dict: - processor_class = processor_class_from_name(config_dict["processor_class"]) - return processor_class.from_pretrained(pretrained_model_name_or_path, **kwargs) - - # Otherwise, load config, if it can be loaded. - if not isinstance(config, PretrainedConfig): - config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - - model_type = config_class_to_model_type(type(config).__name__) - - if getattr(config, "processor_class", None) is not None: - processor_class = processor_class_from_name(config.processor_class) - return processor_class.from_pretrained(pretrained_model_name_or_path, **kwargs) - - model_type = config_class_to_model_type(type(config).__name__) - if model_type is not None: + processor_class = config_dict.get("processor_class", None) + if "AutoProcessor" in config_dict.get("auto_map", {}): + processor_auto_map = config_dict["auto_map"]["AutoProcessor"] + + if processor_class is None: + # Next, let's check whether the processor class is saved in a tokenizer + tokenizer_config_file = get_file_from_repo( + pretrained_model_name_or_path, TOKENIZER_CONFIG_FILE, **get_file_from_repo_kwargs + ) + if tokenizer_config_file is not None: + with open(tokenizer_config_file, encoding="utf-8") as reader: + config_dict = json.load(reader) + + processor_class = config_dict.get("processor_class", None) + if "AutoProcessor" in config_dict.get("auto_map", {}): + processor_auto_map = config_dict["auto_map"]["AutoProcessor"] + + if processor_class is None: + # Otherwise, load config, if it can be loaded. + if not isinstance(config, PretrainedConfig): + config = AutoConfig.from_pretrained( + pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs + ) + + # And check if the config contains the processor class. + processor_class = getattr(config, "processor_class", None) + if hasattr(config, "auto_map") and "AutoProcessor" in config.auto_map: + processor_auto_map = config.auto_map["AutoProcessor"] + + if processor_class is not None: + # If we have custom code for a feature extractor, we get the proper class. + if processor_auto_map is not None: + if not trust_remote_code: + raise ValueError( + f"Loading {pretrained_model_name_or_path} requires you to execute the feature extractor file " + "in that repo on your local machine. Make sure you have read the code there to avoid " + "malicious use, then set the option `trust_remote_code=True` to remove this error." 
+ ) + if kwargs.get("revision", None) is None: + logger.warning( + "Explicitly passing a `revision` is encouraged when loading a feature extractor with custom " + "code to ensure no malicious code has been contributed in a newer revision." + ) + + module_file, class_name = processor_auto_map.split(".") + processor_class = get_class_from_dynamic_module( + pretrained_model_name_or_path, module_file + ".py", class_name, **kwargs + ) + else: + processor_class = processor_class_from_name(processor_class) + + return processor_class.from_pretrained( + pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs + ) + + # Last try: we use the PROCESSOR_MAPPING. + if type(config) in PROCESSOR_MAPPING: return PROCESSOR_MAPPING[type(config)].from_pretrained(pretrained_model_name_or_path, **kwargs) raise ValueError( @@ -192,3 +234,15 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): f"its {FEATURE_EXTRACTOR_NAME}, or one of the following `model_type` keys in its {CONFIG_NAME}: " f"{', '.join(c for c in PROCESSOR_MAPPING_NAMES.keys())}" ) + + @staticmethod + def register(config_class, processor_class): + """ + Register a new processor for this class. + + Args: + config_class ([`PretrainedConfig`]): + The configuration corresponding to the model to register. + processor_class ([`FeatureExtractorMixin`]): The processor to register. + """ + PROCESSOR_MAPPING.register(config_class, processor_class) diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 043843fd52db7f..41d44c641f3348 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -469,7 +469,13 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): # Next, let's try to use the tokenizer_config file to get the tokenizer class. tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs) config_tokenizer_class = tokenizer_config.get("tokenizer_class") - tokenizer_auto_map = tokenizer_config.get("auto_map") + tokenizer_auto_map = None + if "auto_map" in tokenizer_config: + if isinstance(tokenizer_config["auto_map"], (tuple, list)): + # Legacy format for dynamic tokenizers + tokenizer_auto_map = tokenizer_config["auto_map"] + else: + tokenizer_auto_map = tokenizer_config["auto_map"].get("AutoTokenizer", None) # If that did not work, let's try to use the config. 
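The `tokenization_auto.py` hunk above accepts both the legacy and the newer `auto_map` layout in `tokenizer_config.json`. A small sketch of the two shapes the branching has to handle (file contents and module/class names are invented examples):

```python
# Newer layout: one entry per auto class.
new_style = {"auto_map": {"AutoTokenizer": ["custom_tok.CustomTokenizer", "custom_tok.CustomTokenizerFast"]}}
# Legacy layout for dynamic tokenizers: a bare (slow, fast) pair.
legacy_style = {"auto_map": ["custom_tok.CustomTokenizer", "custom_tok.CustomTokenizerFast"]}


def pick_tokenizer_auto_map(tokenizer_config):
    # Mirrors the branching added in the diff above.
    auto_map = tokenizer_config.get("auto_map")
    if auto_map is None:
        return None
    if isinstance(auto_map, (tuple, list)):
        return auto_map
    return auto_map.get("AutoTokenizer", None)


print(pick_tokenizer_auto_map(new_style))
print(pick_tokenizer_auto_map(legacy_style))
```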
if config_tokenizer_class is None: diff --git a/src/transformers/models/bart/modeling_tf_bart.py b/src/transformers/models/bart/modeling_tf_bart.py index b9abc647abd653..058fdb99f29857 100644 --- a/src/transformers/models/bart/modeling_tf_bart.py +++ b/src/transformers/models/bart/modeling_tf_bart.py @@ -44,8 +44,8 @@ TFWrappedEmbeddings, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_bart import BartConfig diff --git a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py index 7d7d431c7ec2c0..bf5ddb365b872d 100644 --- a/src/transformers/models/bert/modeling_tf_bert.py +++ b/src/transformers/models/bert/modeling_tf_bert.py @@ -57,8 +57,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_bert import BertConfig diff --git a/src/transformers/models/blenderbot/modeling_tf_blenderbot.py b/src/transformers/models/blenderbot/modeling_tf_blenderbot.py index 6d50492062f294..65135a1d07960d 100644 --- a/src/transformers/models/blenderbot/modeling_tf_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_tf_blenderbot.py @@ -46,8 +46,8 @@ TFWrappedEmbeddings, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_blenderbot import BlenderbotConfig diff --git a/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py index fdf0c63c0ae73c..0243030a43015c 100644 --- a/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py @@ -44,8 +44,8 @@ TFWrappedEmbeddings, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_blenderbot_small import BlenderbotSmallConfig diff --git a/src/transformers/models/clip/modeling_tf_clip.py b/src/transformers/models/clip/modeling_tf_clip.py index 3a1621ba9d9cc9..4902248b256767 100644 --- a/src/transformers/models/clip/modeling_tf_clip.py +++ b/src/transformers/models/clip/modeling_tf_clip.py @@ -39,8 +39,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig diff --git a/src/transformers/models/convbert/modeling_tf_convbert.py b/src/transformers/models/convbert/modeling_tf_convbert.py index 84967b5fba1cb7..0c4d265dcd75cc 100644 --- a/src/transformers/models/convbert/modeling_tf_convbert.py +++ b/src/transformers/models/convbert/modeling_tf_convbert.py @@ -43,8 +43,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_convbert import ConvBertConfig diff --git a/src/transformers/models/ctrl/modeling_tf_ctrl.py b/src/transformers/models/ctrl/modeling_tf_ctrl.py index acfce53c8a75ec..c72448310a8550 100644 --- a/src/transformers/models/ctrl/modeling_tf_ctrl.py +++ b/src/transformers/models/ctrl/modeling_tf_ctrl.py @@ -30,8 +30,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_ctrl import CTRLConfig diff 
--git a/src/transformers/models/deberta/modeling_tf_deberta.py b/src/transformers/models/deberta/modeling_tf_deberta.py index 25a6c07d42cc85..0d36de4895a8e2 100644 --- a/src/transformers/models/deberta/modeling_tf_deberta.py +++ b/src/transformers/models/deberta/modeling_tf_deberta.py @@ -39,8 +39,8 @@ TFTokenClassificationLoss, get_initializer, input_processing, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_deberta import DebertaConfig diff --git a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py index 1a8f8c94ba9571..445cb76256bb7a 100644 --- a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py @@ -38,8 +38,8 @@ TFTokenClassificationLoss, get_initializer, input_processing, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_deberta_v2 import DebertaV2Config diff --git a/src/transformers/models/detr/feature_extraction_detr.py b/src/transformers/models/detr/feature_extraction_detr.py index 28140c75e5a83d..e04f7f2eec10de 100644 --- a/src/transformers/models/detr/feature_extraction_detr.py +++ b/src/transformers/models/detr/feature_extraction_detr.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 and HuggingFace Inc. team. All rights reserved. +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/src/transformers/models/distilbert/modeling_tf_distilbert.py b/src/transformers/models/distilbert/modeling_tf_distilbert.py index 05da8b30617900..86a814a749bdf2 100644 --- a/src/transformers/models/distilbert/modeling_tf_distilbert.py +++ b/src/transformers/models/distilbert/modeling_tf_distilbert.py @@ -45,8 +45,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_distilbert import DistilBertConfig diff --git a/src/transformers/models/electra/modeling_tf_electra.py b/src/transformers/models/electra/modeling_tf_electra.py index f24b003b60125d..68c639de91beae 100644 --- a/src/transformers/models/electra/modeling_tf_electra.py +++ b/src/transformers/models/electra/modeling_tf_electra.py @@ -50,8 +50,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_electra import ElectraConfig diff --git a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py index 8ba4ae31b83a24..a2668b75b117a7 100644 --- a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py @@ -30,13 +30,8 @@ replace_return_docstrings, ) from ...modeling_tf_outputs import TFBaseModelOutput, TFSeq2SeqLMOutput -from ...modeling_tf_utils import ( - TFCausalLanguageModelingLoss, - TFPreTrainedModel, - get_initializer, - input_processing, - shape_list, -) +from ...modeling_tf_utils import TFCausalLanguageModelingLoss, TFPreTrainedModel, get_initializer, input_processing +from ...tf_utils import shape_list from ...utils import logging from ..auto.configuration_auto import AutoConfig from ..auto.modeling_tf_auto import TFAutoModel, 
TFAutoModelForCausalLM diff --git a/src/transformers/models/flaubert/modeling_tf_flaubert.py b/src/transformers/models/flaubert/modeling_tf_flaubert.py index 87c1c7e6b0423c..c681277a8076da 100644 --- a/src/transformers/models/flaubert/modeling_tf_flaubert.py +++ b/src/transformers/models/flaubert/modeling_tf_flaubert.py @@ -38,8 +38,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from ..xlm.modeling_tf_xlm import ( TFXLMForMultipleChoice, diff --git a/src/transformers/models/funnel/configuration_funnel.py b/src/transformers/models/funnel/configuration_funnel.py index 9496a63f48d247..5684427cb7a702 100644 --- a/src/transformers/models/funnel/configuration_funnel.py +++ b/src/transformers/models/funnel/configuration_funnel.py @@ -77,8 +77,7 @@ class FunnelConfig(PretrainedConfig): type_vocab_size (`int`, *optional*, defaults to 3): The vocabulary size of the `token_type_ids` passed when calling [`FunnelModel`] or [`TFFunnelModel`]. initializer_range (`float`, *optional*, defaults to 0.1): - The standard deviation of the *uniform initializer* for initializing all weight matrices in attention - layers. + The upper bound of the *uniform initializer* for initializing all weight matrices in attention layers. initializer_std (`float`, *optional*): The standard deviation of the *normal initializer* for initializing the embedding matrix and the weight of linear layers. Will default to 1 for the embedding matrix and the value given by Xavier initialization for diff --git a/src/transformers/models/funnel/modeling_tf_funnel.py b/src/transformers/models/funnel/modeling_tf_funnel.py index b3d9a8506eb721..9b4b6e7083caf6 100644 --- a/src/transformers/models/funnel/modeling_tf_funnel.py +++ b/src/transformers/models/funnel/modeling_tf_funnel.py @@ -47,8 +47,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_funnel import FunnelConfig diff --git a/src/transformers/models/gpt2/modeling_tf_gpt2.py b/src/transformers/models/gpt2/modeling_tf_gpt2.py index ab32cc0e831486..d4939594d5ea2d 100644 --- a/src/transformers/models/gpt2/modeling_tf_gpt2.py +++ b/src/transformers/models/gpt2/modeling_tf_gpt2.py @@ -44,8 +44,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_gpt2 import GPT2Config diff --git a/src/transformers/models/hubert/modeling_tf_hubert.py b/src/transformers/models/hubert/modeling_tf_hubert.py index 548ea5e3856e88..936f2ab0dc22e9 100644 --- a/src/transformers/models/hubert/modeling_tf_hubert.py +++ b/src/transformers/models/hubert/modeling_tf_hubert.py @@ -28,13 +28,8 @@ replace_return_docstrings, ) from ...modeling_tf_outputs import TFBaseModelOutput, TFCausalLMOutput -from ...modeling_tf_utils import ( - TFPreTrainedModel, - booleans_processing, - get_initializer, - keras_serializable, - shape_list, -) +from ...modeling_tf_utils import TFPreTrainedModel, booleans_processing, get_initializer, keras_serializable +from ...tf_utils import shape_list from ...tokenization_utils_base import BatchEncoding from ...utils import logging from .configuration_hubert import HubertConfig diff --git a/src/transformers/models/layoutlm/modeling_tf_layoutlm.py b/src/transformers/models/layoutlm/modeling_tf_layoutlm.py index dbc9b21b0bdafa..6f308835007e39 100644 --- 
a/src/transformers/models/layoutlm/modeling_tf_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_tf_layoutlm.py @@ -39,8 +39,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_layoutlm import LayoutLMConfig diff --git a/src/transformers/models/led/modeling_tf_led.py b/src/transformers/models/led/modeling_tf_led.py index 924a62f7d99fc3..e282db0e811fe7 100644 --- a/src/transformers/models/led/modeling_tf_led.py +++ b/src/transformers/models/led/modeling_tf_led.py @@ -39,8 +39,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_led import LEDConfig diff --git a/src/transformers/models/longformer/modeling_tf_longformer.py b/src/transformers/models/longformer/modeling_tf_longformer.py index da34d11b80b1a5..458133a9b463c1 100644 --- a/src/transformers/models/longformer/modeling_tf_longformer.py +++ b/src/transformers/models/longformer/modeling_tf_longformer.py @@ -38,8 +38,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_longformer import LongformerConfig diff --git a/src/transformers/models/marian/modeling_tf_marian.py b/src/transformers/models/marian/modeling_tf_marian.py index be9be08fb17148..ba094d6a0a7e30 100644 --- a/src/transformers/models/marian/modeling_tf_marian.py +++ b/src/transformers/models/marian/modeling_tf_marian.py @@ -45,8 +45,8 @@ TFWrappedEmbeddings, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_marian import MarianConfig diff --git a/src/transformers/models/maskformer/__init__.py b/src/transformers/models/maskformer/__init__.py index cb38586d75ed9a..c4237f91dc61a1 100644 --- a/src/transformers/models/maskformer/__init__.py +++ b/src/transformers/models/maskformer/__init__.py @@ -33,6 +33,7 @@ "MASKFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", "MaskFormerForInstanceSegmentation", "MaskFormerModel", + "MaskFormerPretrainedModel", ] if TYPE_CHECKING: @@ -45,6 +46,7 @@ MASKFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, MaskFormerForInstanceSegmentation, MaskFormerModel, + MaskFormerPretrainedModel, ) diff --git a/src/transformers/models/maskformer/configuration_maskformer.py b/src/transformers/models/maskformer/configuration_maskformer.py index 5adc71104467a5..8a8ba17df527d8 100644 --- a/src/transformers/models/maskformer/configuration_maskformer.py +++ b/src/transformers/models/maskformer/configuration_maskformer.py @@ -13,10 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """ MaskFormer model configuration""" -from __future__ import annotations import copy - -from typing import Dict, Optional +from typing import Dict from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -26,7 +24,7 @@ MASKFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = [ - "facebook/maskformer-swin-base-ade-640", + "facebook/maskformer-swin-base-ade", # See all MaskFormer models at https://huggingface.co/models?filter=maskformer ] @@ -37,14 +35,15 @@ class MaskFormerConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`MaskFormer`]. It is used to instantiate a MaskFormer model according to the specified arguments, defining the model architecture. 
Instantiating a - configuration with the defaults will yield a similar configuration to that of the "maskformer-swin-base-ade-640" - architecture trained on ade20k-150 + configuration with the defaults will yield a similar configuration to that of the + "Francesco/maskformer-swin-base-ade" architecture trained on ADE20k-150 Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. + Currently, maskformer supports only Swin backbone. + Args: - dataset_metadata (DatasetMetadata, optional): [description]. Defaults to None. mask_feature_size (Optional[int], optional): The masks' features size, this value will also be used to specify the Feature Pyramid Network features size. Defaults to 256. @@ -61,17 +60,17 @@ class MaskFormerConfig(PretrainedConfig): mask_weight (Optional[float], optional): [description]. Defaults to 20.0. Raises: - ValueError: Raised if the backbone model type selected is not in `MaskFormerConfig.backbones_supported` + `ValueError`: Raised if the backbone model type selected is not in `MaskFormerConfig.backbones_supported` Examples: ```python - >>> from transformers import MaskFormerModel, MaskFormerConfig + >>> from transformers import MaskFormerConfig, MaskFormerModel - >>> # Initializing a maskFormer facebook/maskformer-swin-base-ade-640 configuration + >>> # Initializing a MaskFormer facebook/maskformer-swin-base-ade configuration >>> configuration = MaskFormerConfig() - >>> # Initializing a model from the facebook/maskformer-swin-base-ade-640 style configuration + >>> # Initializing a model from the facebook/maskformer-swin-base-ade style configuration >>> model = MaskFormerModel(configuration) >>> # Accessing the model configuration @@ -80,25 +79,23 @@ class MaskFormerConfig(PretrainedConfig): """ model_type = "maskformer" - - attribute_map = {"hidden_size": "d_model"} - + attribute_map = {"hidden_size": "mask_feature_size"} backbones_supported = ["swin"] def __init__( self, - fpn_feature_size: Optional[int] = 256, - mask_feature_size: Optional[int] = 256, - no_object_weight: Optional[float] = 0.1, - use_auxilary_loss: Optional[bool] = False, - backbone_config: Optional[Dict] = None, - detr_config: Optional[Dict] = None, + fpn_feature_size: int = 256, + mask_feature_size: int = 256, + no_object_weight: float = 0.1, + use_auxilary_loss: bool = False, + backbone_config: Dict = None, + detr_config: Dict = None, init_std: float = 0.02, init_xavier_std: float = 1.0, - dice_weight: Optional[float] = 1.0, - cross_entropy_weight: Optional[float] = 1.0, - mask_weight: Optional[float] = 20.0, - num_labels: Optional[int] = 150, + dice_weight: float = 1.0, + cross_entropy_weight: float = 1.0, + mask_weight: float = 20.0, + num_labels: int = 150, **kwargs, ): if backbone_config is None: @@ -121,20 +118,13 @@ def __init__( ) backbone_config = AutoConfig.for_model(backbone_model_type, **backbone_config) - if detr_config is None: - detr_config = DetrConfig() - - else: - detr_config = DetrConfig(**detr_config) + detr_config = DetrConfig() if detr_config is None else DetrConfig(**detr_config) self.backbone_config = backbone_config - self.detr_config = detr_config - + # main feature dimension for the model self.fpn_feature_size = fpn_feature_size self.mask_feature_size = mask_feature_size - self.no_object_weight = no_object_weight - self.use_auxilary_loss = use_auxilary_loss # initializer self.init_std = init_std self.init_xavier_std = init_xavier_std @@ -142,13 +132,12 @@ 
def __init__( self.cross_entropy_weight = cross_entropy_weight self.dice_weight = dice_weight self.mask_weight = mask_weight - + self.use_auxilary_loss = use_auxilary_loss + self.no_object_weight = no_object_weight super().__init__(num_labels=num_labels, **kwargs) @classmethod - def from_backbone_and_detr_configs( - cls, backbone_config: PretrainedConfig, detr_config: DetrConfig, **kwargs - ) -> MaskFormerConfig: + def from_backbone_and_detr_configs(cls, backbone_config: PretrainedConfig, detr_config: DetrConfig, **kwargs): """Instantiate a [`MaskFormerConfig`] (or a derived class) from a pre-trained backbone model configuration and DETR model configuration. @@ -159,11 +148,7 @@ def from_backbone_and_detr_configs( Returns: [`MaskFormerConfig`]: An instance of a configuration object """ - return cls( - backbone_config=backbone_config.to_dict(), - detr_config=detr_config.to_dict(), - **kwargs, - ) + return cls(backbone_config=backbone_config.to_dict(), detr_config=detr_config.to_dict(), **kwargs) @property def num_attention_heads(self) -> int: diff --git a/src/transformers/models/maskformer/convert_maskformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/maskformer/convert_maskformer_original_pytorch_checkpoint_to_pytorch.py index 11ec902266d2b2..5fd1cb5eaab4ef 100644 --- a/src/transformers/models/maskformer/convert_maskformer_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/maskformer/convert_maskformer_original_pytorch_checkpoint_to_pytorch.py @@ -1,6 +1,17 @@ -from __future__ import annotations - -import logging +# coding=utf-8 +# Copyright 2022 Facebook AI Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
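The `from_backbone_and_detr_configs` classmethod diffed above composes a `MaskFormerConfig` from a backbone config and a `DetrConfig`. A minimal usage sketch, assuming the public `SwinConfig`/`DetrConfig` classes; the Swin hyperparameters below are illustrative, not taken from this PR:

```python
from transformers import DetrConfig, MaskFormerConfig, SwinConfig

# Hypothetical backbone hyperparameters; any SwinConfig works since only "swin"
# appears in MaskFormerConfig.backbones_supported.
backbone_config = SwinConfig(embed_dim=128, depths=[2, 2, 18, 2], num_heads=[4, 8, 16, 32])
detr_config = DetrConfig()

# The classmethod serializes both configs with to_dict() and forwards them to
# MaskFormerConfig(backbone_config=..., detr_config=..., **kwargs).
config = MaskFormerConfig.from_backbone_and_detr_configs(backbone_config, detr_config, num_labels=150)
```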
from argparse import ArgumentParser from dataclasses import dataclass from pathlib import Path @@ -25,13 +36,11 @@ MaskFormerForInstanceSegmentationOutput, MaskFormerModel, MaskFormerOutput, - MaskFormerPixelLevelModuleOutput, ) +from transformers.utils import logging StateDict = Dict[str, Tensor] -from transformers.utils import logging - logging.set_verbosity_info() logger = logging.get_logger() @@ -326,20 +335,6 @@ def replace_backbone(self, dst_state_dict: StateDict, src_state_dict: StateDict, ), ] ) - - # model.layernorm.weight and our hiddin_state_norms[3] have to be the same - # assert torch.allclose( - # dst_state_dict[f"{dst_prefix}.hidden_states_norms.3.weight"], - # dst_state_dict[f"{dst_prefix}.model.layernorm.weight"], - # ) - # dst_state_dict[f"{dst_prefix}.hidden_states_norms.3.weight"].copy_( - # dst_state_dict[f"{dst_prefix}.model.layernorm.weight"] - # ) - - # dst_state_dict[f"{dst_prefix}.hidden_states_norms.3.bias"].copy_( - # dst_state_dict[f"{dst_prefix}.model.layernorm.bias"] - # ) - self.pop_all(renamed_keys, dst_state_dict, src_state_dict) def replace_pixel_module(self, dst_state_dict: StateDict, src_state_dict: StateDict): @@ -538,9 +533,7 @@ def convert_instance_segmentation( return mask_former @staticmethod - def using_dirs( - checkpoints_dir: Path, config_dir: Path - ) -> Iterator[Tuple[OriginalMaskFormerCheckpoinToOursConverter, Path, Path]]: + def using_dirs(checkpoints_dir: Path, config_dir: Path) -> Iterator[Tuple[object, Path, Path]]: checkpoints: List[Path] = checkpoints_dir.glob("**/*.pkl") for checkpoint in checkpoints: @@ -584,7 +577,7 @@ def test(original_model, our_model: MaskFormerForInstanceSegmentation): original_model_backbone_features.values(), our_model_output.encoder_hidden_states ): - assert torch.allclose(original_model_feature, our_model_feature, atol=1e-2) + assert torch.allclose(original_model_feature, our_model_feature, atol=1e-3) original_model_pixel_out = original_model.sem_seg_head.pixel_decoder.forward_features( original_model_backbone_features @@ -603,11 +596,33 @@ def test(original_model, our_model: MaskFormerForInstanceSegmentation): our_segmentation = feature_extractor.post_process_segmentation(our_model_out, target_size=(384, 384)) - assert torch.allclose(original_segmentation, our_segmentation, atol=1e-2) + assert torch.allclose(original_segmentation, our_segmentation, atol=1e-3) logger.info("✅ Test passed!") +def get_name(checkpoint_file: Path): + model_name_raw: str = checkpoint_file.stem + # model_name_raw is something like maskformer_panoptic_swin_base_IN21k_384_bs64_554k + parent_name: str = checkpoint_file.parents[0].stem + backbone = "swin" + dataset = "" + if "coco" in parent_name: + dataset = "coco" + elif "ade" in parent_name: + dataset = "ade" + else: + raise ValueError(f"{parent_name} must be wrong since we didn't find 'coco' or 'ade' in it ") + + backbone_types = ["tiny", "small", "base", "large"] + + backbone_type = list(filter(lambda x: x in model_name_raw, backbone_types))[0] + + model_name = f"maskformer-{backbone}-{backbone_type}-{dataset}" + + return model_name + + if __name__ == "__main__": parser = ArgumentParser( @@ -626,7 +641,7 @@ def test(original_model, our_model: MaskFormerForInstanceSegmentation): ) parser.add_argument( "--pytorch_dump_folder_path", - default=Path("/tmp/hf/models"), + default=Path("/home/zuppif/Desktop/hf/models"), type=Path, help="Path to the folder to output PyTorch models.", ) @@ -675,9 +690,21 @@ def test(original_model, our_model: MaskFormerForInstanceSegmentation): 
test(original_model, mask_former_for_instance_segmentation) - model_name: str = f"{checkpoint_file.parents[0].stem.replace('-', '_')}-{checkpoint_file.stem}" - - logger.info(model_name) + model_name = get_name(checkpoint_file) + logger.info(f"🪄 Saving {model_name}") feature_extractor.save_pretrained(save_directory / model_name) mask_former_for_instance_segmentation.save_pretrained(save_directory / model_name) + + feature_extractor.push_to_hub( + repo_path_or_name=save_directory / model_name, + organization="Francesco", + commit_message="Add model", + use_temp_dir=True, + ) + mask_former_for_instance_segmentation.push_to_hub( + repo_path_or_name=save_directory / model_name, + organization="Francesco", + commit_message="Add model", + use_temp_dir=True, + ) diff --git a/src/transformers/models/maskformer/feature_extraction_maskformer.py b/src/transformers/models/maskformer/feature_extraction_maskformer.py index 25d054d5ab4588..cc0ae630c05fc7 100644 --- a/src/transformers/models/maskformer/feature_extraction_maskformer.py +++ b/src/transformers/models/maskformer/feature_extraction_maskformer.py @@ -18,26 +18,23 @@ import numpy as np from PIL import Image -from torch.nn.functional import interpolate -from transformers.models.maskformer.modeling_maskformer import MaskFormerForInstanceSegmentationOutput, upsample_like +from transformers.models.maskformer.modeling_maskformer import MaskFormerForInstanceSegmentationOutput from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin from ...file_utils import TensorType, is_torch_available -from ...image_utils import ImageFeatureExtractionMixin, is_torch_tensor +from ...image_utils import ImageFeatureExtractionMixin, ImageInput, is_torch_tensor from ...utils import logging if is_torch_available(): import torch from torch import Tensor, nn + from torch.nn.functional import interpolate logger = logging.get_logger(__name__) -ImageInput = Union[Image.Image, np.ndarray, "torch.Tensor", List[Image.Image], List[np.ndarray], List["torch.Tensor"]] - - class MaskFormerFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): r""" Constructs a MaskFormer feature extractor. @@ -47,8 +44,6 @@ class MaskFormerFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM Args: - format (`str`, *optional*, defaults to `"coco_detection"`): - Data format of the annotations. One of "coco_detection" or "coco_panoptic". do_resize (`bool`, *optional*, defaults to `True`): Whether to resize the input to a certain `size`. size (`int`, *optional*, defaults to 800): @@ -197,7 +192,7 @@ def __call__( - **pixel_values** -- Pixel values to be fed to a model. - **pixel_mask** -- Pixel mask to be fed to a model (when `pad_and_return_pixel_mask=True` or if - *"pixel_mask"* is in `self.model_input_names`). + `"pixel_mask"` is in `self.model_input_names`). - **labels** -- Optional labels to be fed to a model (when `annotations` are provided) """ # Input type checking for clearer error @@ -234,11 +229,9 @@ def __call__( if not valid_annotations: raise ValueError( """ - Annotations must of type `Dict` (single image) or `List[Dict]` (batch of images). In case of object - detection, each dictionary should contain the keys 'image_id' and 'annotations', with the latter - being a list of annotations in COCO format. In case of panoptic segmentation, each dictionary - should contain the keys 'file_name', 'image_id' and 'segments_info', with the latter being a list - of annotations in COCO format. 
+ Annotations must of type `Dict` (single image) or `List[Dict]` (batch of images). The annotations + must be numpy arrays in the following format: { "masks" : the target mask, with shape [C,H,W], + "labels" : the target labels, with shape [C]} """ ) @@ -321,7 +314,7 @@ def encode_inputs( """ max_size = self._max_by_axis([list(image.shape) for image in pixel_values_list]) - c, height, width = max_size + channels, height, width = max_size pixel_values = [] pixel_mask = [] mask_labels = [] @@ -330,7 +323,7 @@ def encode_inputs( for idx, image in enumerate(pixel_values_list): # create padded image if pad_and_return_pixel_mask: - padded_image = np.zeros((c, height, width), dtype=np.float32) + padded_image = np.zeros((channels, height, width), dtype=np.float32) padded_image[: image.shape[0], : image.shape[1], : image.shape[2]] = np.copy(image) image = padded_image pixel_values.append(image) @@ -351,10 +344,7 @@ def encode_inputs( pixel_mask.append(mask) # return as BatchFeature - data = { - "pixel_values": pixel_values, - "pixel_mask": pixel_mask, - } + data = {"pixel_values": pixel_values, "pixel_mask": pixel_mask} if annotations: data["mask_labels"] = mask_labels @@ -367,14 +357,15 @@ def encode_inputs( def post_process_segmentation( self, outputs: MaskFormerForInstanceSegmentationOutput, target_size: Tuple[int, int] = None ) -> Tensor: - """Converts the output of [`MaskFormerForInstanceSegmentationOutput`] into image segmentation predictions. Only supports - PyTorch. + """ + Converts the output of [`MaskFormerForInstanceSegmentationOutput`] into image segmentation predictions. Only + supports PyTorch. - Args: - outputs (MaskFormerForInstanceSegmentationOutput): The outputs from MaskFor + Args: + outputs (MaskFormerForInstanceSegmentationOutput): The outputs from MaskFor - Returns: - Tensor: A tensor of shape `batch_size, num_labels, height, width` + Returns: + Tensor: A tensor of shape `batch_size, num_labels, height, width` """ # class_queries_logitss has shape [BATCH, QUERIES, CLASSES + 1] class_queries_logits = outputs.class_queries_logits @@ -404,14 +395,13 @@ def remove_low_and_no_objects( self, masks: Tensor, scores: Tensor, labels: Tensor, object_mask_threshold: float, num_labels: int ) -> Tuple[Tensor, Tensor, Tensor]: """ - Binarize the given masks using `object_mask_threshold`, it returns the associated values of `masks`, `scores` and `labels` Args: - masks (Tensor): A tensor of shape `(num_queries, height, width)` - scores (Tensor): A tensor of shape `(num_queries)` - labels (Tensor): A tensor of shape `(num_queries)` + masks (`torch.Tensor`): A tensor of shape `(num_queries, height, width)` + scores (`torch.Tensor`): A tensor of shape `(num_queries)` + labels (`torch.Tensor`): A tensor of shape `(num_queries)` object_mask_threshold (float): A number between 0 and 1 used to binarize the masks Raises: @@ -430,14 +420,15 @@ def remove_low_and_no_objects( def post_process_semantic_segmentation( self, outputs: MaskFormerForInstanceSegmentationOutput, target_size: Tuple[int, int] = None ) -> Tensor: - """Converts the output of [`MaskFormerForInstanceSegmentationOutput`] into semantic segmentation predictions. Only + """ + Converts the output of [`MaskFormerForInstanceSegmentationOutput`] into semantic segmentation predictions. Only supports PyTorch. 
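The `encode_inputs` hunk above pads every image in a batch to the largest height/width and records a `pixel_mask` of ones over the valid region. A standalone sketch of that idea (the helper name and the H/W-only padding are simplifications, not the feature extractor's exact method):

```python
import numpy as np

def pad_and_mask(images):
    """Pad a list of (channels, height, width) arrays to a common size and build pixel masks."""
    channels, height, width = np.max([image.shape for image in images], axis=0)
    pixel_values, pixel_masks = [], []
    for image in images:
        padded = np.zeros((channels, height, width), dtype=np.float32)
        padded[:, : image.shape[1], : image.shape[2]] = image
        mask = np.zeros((height, width), dtype=np.int64)
        mask[: image.shape[1], : image.shape[2]] = 1  # 1 = real pixel, 0 = padding
        pixel_values.append(padded)
        pixel_masks.append(mask)
    return np.stack(pixel_values), np.stack(pixel_masks)

# images of different sizes end up in one (batch, channels, height, width) array plus masks
batch, masks = pad_and_mask([np.zeros((3, 480, 640)), np.zeros((3, 384, 384))])
```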
- Args: - outputs (MaskFormerForInstanceSegmentationOutput): The outputs from MaskFor + Args: + outputs (`MaskFormerForInstanceSegmentationOutput`): The outputs from MaskFormerForInstanceSegmentation - Returns: - Tensor: A tensor of shape `batch_size, height, width` + Returns: + Tensor: A tensor of shape `batch_size, height, width` """ segmentation: Tensor = self.post_process_segmentation(outputs, target_size) semantic_segmentation: Tensor = segmentation.argmax(dim=1) @@ -454,9 +445,8 @@ def post_process_panoptic_segmentation( Converts the output of [`MaskFormerForInstanceSegmentationOutput`] into image panoptic segmentation predictions. Only supports PyTorch. - Args: - outputs (MaskFormerForInstanceSegmentationOutput): [description] + outputs (`MaskFormerForInstanceSegmentationOutput`): [description] object_mask_threshold (Optional[float], optional): [description]. Defaults to 0.8. overlap_mask_area_threshold (Optional[float], optional): [description]. Defaults to 0.8. is_thing_map (Dict[int, bool], optional): [description]. @@ -490,7 +480,6 @@ def post_process_panoptic_segmentation( mask_probs = masks_queries_logits.sigmoid() # mask probs has shape [BATCH, QUERIES, HEIGHT, WIDTH] # now, we need to iterate over the batch size to correctly process the segmentation we got from the queries using our thresholds. Even if the original predicted masks have the same shape across the batch, they won't after thresholding so batch-wise operations are impossible - results: List[Dict[str, Tensor]] = [] for (mask_probs, pred_scores, pred_labels) in zip(mask_probs, pred_scores, pred_labels): diff --git a/src/transformers/models/maskformer/modeling_maskformer.py b/src/transformers/models/maskformer/modeling_maskformer.py index 1053c25e2efda5..f31f8b6faf8e08 100644 --- a/src/transformers/models/maskformer/modeling_maskformer.py +++ b/src/transformers/models/maskformer/modeling_maskformer.py @@ -14,34 +14,35 @@ # limitations under the License. 
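`post_process_segmentation` combines `class_queries_logits` of shape `(batch, queries, classes + 1)` with the per-query mask logits into a `(batch, num_labels, height, width)` map, and `post_process_semantic_segmentation` then takes the argmax over the class dimension. A sketch of that combination, following the MaskFormer paper's formulation; the random tensors stand in for real model outputs:

```python
import torch

batch_size, num_queries, num_labels = 2, 100, 150
height = width = 384

class_queries_logits = torch.randn(batch_size, num_queries, num_labels + 1)  # last class = "no object"
masks_queries_logits = torch.randn(batch_size, num_queries, height, width)

# drop the null class, keep per-query class probabilities and per-query mask probabilities
masks_classes = class_queries_logits.softmax(dim=-1)[..., :-1]   # (batch, queries, num_labels)
masks_probs = masks_queries_logits.sigmoid()                     # (batch, queries, height, width)

# marginalize over the queries: every pixel gets one score per label
segmentation = torch.einsum("bqc,bqhw->bchw", masks_classes, masks_probs)
semantic_map = segmentation.argmax(dim=1)                        # (batch, height, width)
```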
""" PyTorch MaskFormer model.""" -from __future__ import annotations - -import logging +import collections.abc +import math +import random from dataclasses import dataclass from numbers import Number -from optparse import Option from typing import Dict, List, Optional, Tuple import numpy as np import torch +import torch.distributed as dist from torch import Tensor, nn from torch.nn.functional import binary_cross_entropy_with_logits, cross_entropy, interpolate -from transformers.modeling_outputs import BaseModelOutput +from transformers.utils import logging +from ...activations import ACT2FN from ...file_utils import ( ModelOutput, add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, + is_scipy_available, replace_return_docstrings, requires_backends, - is_scipy_available, ) -from ...modeling_utils import PreTrainedModel -from ...utils import logging -from ..detr.modeling_detr import DetrDecoder, DetrDecoderOutput -from ..swin import SwinConfig, SwinModel +from ...modeling_outputs import BaseModelOutputWithCrossAttentions +from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer +from ..detr import DetrConfig +from ..swin import SwinConfig from .configuration_maskformer import MaskFormerConfig @@ -49,15 +50,14 @@ from scipy.optimize import linear_sum_assignment logger = logging.get_logger(__name__) -import torch.distributed as dist _CONFIG_FOR_DOC = "MaskFormerConfig" -_CHECKPOINT_FOR_DOC = "facebook/maskformer-swin-base-ade-640" +_CHECKPOINT_FOR_DOC = "facebook/maskformer-swin-base-ade" _FEAT_EXTRACTOR_FOR_DOC = "MaskFormerFeatureExtractor" MASKFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "facebook/maskformer-swin-base-ade-640", + "facebook/maskformer-swin-base-ade", # See all MaskFormer models at https://huggingface.co/models?filter=maskformer ] @@ -71,27 +71,142 @@ def get_world_size() -> int: return dist.get_world_size() +# This was copied from original implementation +def is_dist_avail_and_initialized(): + if not dist.is_available(): + return False + if not dist.is_initialized(): + return False + return True + + +@dataclass +class MaskFormerSwinModelOutputWithPooling(ModelOutput): + """ + Class for MaskFormerSwinModel's outputs that also contains the spatial dimensions of the hidden states. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): + Last layer hidden-state of the first token of the sequence (classification token) after further processing + through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns + the classification token after processing through a linear layer and a tanh activation function. The linear + layer weights are trained from the next sentence prediction (classification) objective during pretraining. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
+ hidden_states_spatial_dimensions (`tuple(tuple(int, int))`, *optional*, a tuple containing the spatial dimension of each `hidden_state` needed to reshape the `hidden_states` to `batch, channels, height, width`. Due to padding, their spatial size cannot inferred before the `forward` method: + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: torch.FloatTensor = None + pooler_output: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + hidden_states_spatial_dimensions: Tuple[Tuple[int, int]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class MaskFormerSwinBaseModelOutput(ModelOutput): + """ + Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding). + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + + If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, + hidden_size)` is output. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if + `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, + encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if + `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values` + input) to speed up sequential decoding. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + hidden_states_spatial_dimensions (`tuple(tuple(int, int))`, *optional*, a tuple containing the spatial dimension of each `hidden_state` needed to reshape the `hidden_states` to `batch, channels, height, width`. Due to padding, their spatial size cannot inferred before the `forward` method: + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
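The `hidden_states_spatial_dimensions` field documented above carries the `(height, width)` of each stage so the sequence-shaped `hidden_states` can be folded back into feature maps. A minimal sketch of that reshape; the helper name and sample sizes are illustrative:

```python
import torch

def to_feature_map(hidden_state: torch.Tensor, spatial_dimensions: tuple) -> torch.Tensor:
    # hidden_state: (batch, height * width, hidden_size) as produced by a Swin stage
    height, width = spatial_dimensions
    batch_size, _, hidden_size = hidden_state.shape
    # (batch, hidden_size, height, width): the layout an FPN-style pixel decoder expects
    return hidden_state.permute(0, 2, 1).reshape(batch_size, hidden_size, height, width)

feature_map = to_feature_map(torch.randn(2, 56 * 56, 128), (56, 56))
```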
+ """ + + last_hidden_state: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + hidden_states_spatial_dimensions: Tuple[Tuple[int, int]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +# Copied from transformers.models.detr.modeling_detr.DetrDecoderOutput +class DetrDecoderOutput(BaseModelOutputWithCrossAttentions): + """ + Base class for outputs of the DETR decoder. This class adds one attribute to BaseModelOutputWithCrossAttentions, + namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them + gone through a layernorm. This is useful when training the model with auxiliary decoding losses. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer + plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`): + Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a + layernorm. + """ + + intermediate_hidden_states: Optional[torch.FloatTensor] = None + + @dataclass class MaskFormerPixelLevelModuleOutput(ModelOutput): """MaskFormer's pixel level module output. It returns both the last and (optionally) the hidden states from the `encoder` - and `decoder`. By default, the `encoder` is a Swin Transformer and the `decoder` is a Feature Pyramid Network - (FPN). - - The `encoder_last_hidden_state` are referred on the paper as **images features**, while - `decoder_last_hidden_state` as **pixel embeddings** - Args: - encoder_last_hidden_state (`torch.FloatTensor` of shape`(batch_size, num_channels, height, width)`): - Last hidden states (final feature map) of the last stage of the encoder. - encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) - of shape `(batch_size, num_channels, height, width)`. 
Hidden-states (also called feature maps) of the - model at the output of each stage. - decoder_last_hidden_state (`torch.FloatTensor` of shape`(batch_size, num_channels, height, width)`): - Last hidden states (final feature map) of the last stage of the decoder. - decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) - of shape `(batch_size, num_channels, height, width)`. Hidden-states (also called feature maps) of the - model at the output of each stage. + and `decoder`. By default, the `encoder` is a MaskFormerSwin Transformer and the `decoder` is a Feature Pyramid + Network (FPN). + + The `encoder_last_hidden_state` are referred on the paper as **images features**, while `decoder_last_hidden_state` + as **pixel embeddings** + + Args: + encoder_last_hidden_state (`torch.FloatTensor` of shape`(batch_size, num_channels, height, width)`): + Last hidden states (final feature map) of the last stage of the encoder. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, num_channels, height, width)`. Hidden-states (also called feature maps) of the model at + the output of each stage. + decoder_last_hidden_state (`torch.FloatTensor` of shape`(batch_size, num_channels, height, width)`): + Last hidden states (final feature map) of the last stage of the decoder. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, num_channels, height, width)`. Hidden-states (also called feature maps) of the model at + the output of each stage. """ encoder_last_hidden_state: Optional[torch.FloatTensor] = None @@ -100,7 +215,7 @@ class MaskFormerPixelLevelModuleOutput(ModelOutput): decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None -class MaskFormerPixelDecoderOutput(BaseModelOutput): +class MaskFormerPixelDecoderOutput(ModelOutput): """ MaskFormer's pixel decoder module output, practically a Feature Pyramid Network. It returns the last hidden state and (optionally) the hidden states. @@ -115,10 +230,14 @@ class MaskFormerPixelDecoderOutput(BaseModelOutput): Hidden-states of the model at the output of each layer plus the initial embedding outputs. """ + last_hidden_state: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + @dataclass class MaskFormerOutput(ModelOutput): - """Base class for outputs of MaskFormer model. This class returns all the needed hidden states to compute the logits. + """Class for outputs of [`MaskFormerModel`]. This class returns all the needed hidden states to compute the logits. 
Args: encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): @@ -144,7 +263,6 @@ class MaskFormerOutput(ModelOutput): encoder_last_hidden_state: Optional[torch.FloatTensor] = None pixel_decoder_last_hidden_state: Optional[torch.FloatTensor] = None transformer_decoder_last_hidden_state: Optional[torch.FloatTensor] = None - encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None pixel_decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None transformer_decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None @@ -157,7 +275,7 @@ class MaskFormerForInstanceSegmentationOutput(ModelOutput): This output can be directly passed to [`~MaskFormerFeatureExtractor.post_process_segmentation`] or [`~MaskFormerFeatureExtractor.post_process_panoptic_segmentation`] depending on the task. Please, see - [`~MaskFormerFeatureExtractor] for a detail usage. + [`~MaskFormerFeatureExtractor] for details regarding usage. Args: class_queries_logits (torch.FloatTensor): @@ -201,29 +319,24 @@ class MaskFormerForInstanceSegmentationOutput(ModelOutput): loss_dict: Optional[Dict[str, torch.FloatTensor]] = None -# copied from original implementation -def is_dist_avail_and_initialized(): - if not dist.is_available(): - return False - if not dist.is_initialized(): - return False - return True - - -def upsample_like(x: Tensor, like: Tensor, mode: str = "bilinear") -> Tensor: - """An utility function that upsamples `x` to match the dimension of `like` +def upsample_like(pixel_values: Tensor, like: Tensor, mode: str = "bilinear") -> Tensor: + """ + An utility function that upsamples `pixel_values` to match the dimension of `like` Args: - x (Tensor): The tensor we wish to upsample - like (Tensor): The tensor we wish to use as size target - mode (str, optional): The interpolation mode. Defaults to "bilinear". + pixel_values (`torch.Tensor`): + The tensor we wish to upsample + like (`torch.Tensor`): + The tensor we wish to use as size target + mode (str, *optional*): + The interpolation mode. Defaults to "bilinear". Returns: Tensor: The upsampled tensor """ _, _, height, width = like.shape - upsampled: Tensor = interpolate( - x, + upsampled = interpolate( + pixel_values, size=(height, width), mode=mode, align_corners=False, @@ -234,7 +347,7 @@ def upsample_like(x: Tensor, like: Tensor, mode: str = "bilinear") -> Tensor: # refactored from original implementation def dice_loss(inputs: Tensor, labels: Tensor, num_masks: float) -> Tensor: r""" - Compute the DICE loss, similar to generalized IOU for masks as follow + Compute the DICE loss, similar to generalized IOU for masks as follow: $$ \mathcal{L}_{\text{dice}(x, y) = 1 - \frac{2 * x \cap y }{x \cup y + 1}} @@ -246,19 +359,19 @@ def dice_loss(inputs: Tensor, labels: Tensor, num_masks: float) -> Tensor: $$ Args: - inputs (Tensor): A tensor representing a mask - labels (Tensor): + inputs (`torch.Tensor`): + A tensor representing a mask. + labels (`torch.Tensor`): A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs (0 for the negative class and 1 for the positive class). - Returns: - Tensor: The computed loss + `torch.Tensor`: The computed loss. 
""" - probs: Tensor = inputs.sigmoid().flatten(1) - numerator: Tensor = 2 * (probs * labels).sum(-1) - denominator: Tensor = probs.sum(-1) + labels.sum(-1) - loss: Tensor = 1 - (numerator + 1) / (denominator + 1) + probs = inputs.sigmoid().flatten(1) + numerator = 2 * (probs * labels).sum(-1) + denominator = probs.sum(-1) + labels.sum(-1) + loss = 1 - (numerator + 1) / (denominator + 1) loss = loss.sum() / num_masks return loss @@ -268,8 +381,8 @@ def sigmoid_focal_loss( inputs: Tensor, labels: Tensor, num_masks: int, alpha: float = 0.25, gamma: float = 2 ) -> Tensor: r""" - Focal loss proposed in [Focal Loss for Dense Object Detection](https://arxiv.org/abs/1708.02002) originally used - in RetinaNet. The loss is computed as follows + Focal loss proposed in [Focal Loss for Dense Object Detection](https://arxiv.org/abs/1708.02002) originally used in + RetinaNet. The loss is computed as follows $$ \mathcal{L}_{\text{focal loss} = -(1 - p_t)^{\gamma}\log{(p_t)} @@ -279,27 +392,27 @@ def sigmoid_focal_loss( Please refer to equation (1,2,3) of the paper for a better understanding. - Args: - inputs (Tensor): A float tensor of arbitrary shape. - The predictions for each example. - labels (Tensor,): + inputs (`torch.Tensor`): + A float tensor of arbitrary shape. + labels (`torch.Tensor`): A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs (0 for the negative class and 1 for the positive class). - alpha (float, optional): Weighting factor in range (0,1) to balance - positive vs negative examples. Default = -1 (no weighting). - gamma (float, optional): Exponent of the modulating factor (1 - p_t) to - balance easy vs hard examples. + alpha (float, *optional*): + Weighting factor in range (0,1) to balance positive vs negative examples. Default = -1 (no weighting). + gamma (float, *optional*): + Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples. + Returns: - Tensor: The computed loss + `torch.Tensor`: The computed loss """ - probs: Tensor = inputs.sigmoid() - cross_entropy_loss: Tensor = binary_cross_entropy_with_logits(inputs, labels, reduction="none") - p_t: Tensor = probs * labels + (1 - probs) * (1 - labels) - loss: Tensor = cross_entropy_loss * ((1 - p_t) ** gamma) + probs = inputs.sigmoid() + cross_entropy_loss = binary_cross_entropy_with_logits(inputs, labels, reduction="none") + p_t = probs * labels + (1 - probs) * (1 - labels) + loss = cross_entropy_loss * ((1 - p_t) ** gamma) if alpha >= 0: - alpha_t: Tensor = alpha * labels + (1 - alpha) * (1 - labels) + alpha_t = alpha * labels + (1 - alpha) * (1 - labels) loss = alpha_t * loss loss = loss.mean(1).sum() / num_masks @@ -312,20 +425,20 @@ def pair_wise_dice_loss(inputs: Tensor, labels: Tensor) -> Tensor: A pair wise version of the dice loss, see `dice_loss` for usage Args: - inputs (Tensor): A tensor representing a mask - labels (Tensor): + inputs (`torch.Tensor`): + A tensor representing a mask + labels (`torch.Tensor`): A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs (0 for the negative class and 1 for the positive class). 
- Returns: - Tensor: The computed loss between each pairs + `torch.Tensor`: The computed loss between each pairs """ - inputs: Tensor = inputs.sigmoid().flatten(1) - numerator: Tensor = 2 * torch.einsum("nc,mc->nm", inputs, labels) + inputs = inputs.sigmoid().flatten(1) + numerator = 2 * torch.einsum("nc,mc->nm", inputs, labels) # using broadcasting to get a [NUM_QUERIES, NUM_CLASSES] matrix - denominator: Tensor = inputs.sum(-1)[:, None] + labels.sum(-1)[None, :] - loss: Tensor = 1 - (numerator + 1) / (denominator + 1) + denominator = inputs.sum(-1)[:, None] + labels.sum(-1)[None, :] + loss = 1 - (numerator + 1) / (denominator + 1) return loss @@ -335,38 +448,1265 @@ def pair_wise_sigmoid_focal_loss(inputs: Tensor, labels: Tensor, alpha: float = A pair wise version of the focal loss, see `sigmoid_focal_loss` for usage Args: - inputs (Tensor): A tensor representing a mask - labels (Tensor): + inputs (`torch.Tensor`): + A tensor representing a mask + labels (`torch.Tensor`): A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs (0 for the negative class and 1 for the positive class). - Returns: - Tensor: The computed loss between each pairs + `torch.Tensor`: The computed loss between each pairs """ if alpha < 0: - raise ValueError(f"alpha must be positive") + raise ValueError("alpha must be positive") - height_and_width: int = inputs.shape[1] + height_and_width = inputs.shape[1] - prob: Tensor = inputs.sigmoid() + prob = inputs.sigmoid() cross_entropy_loss_pos = binary_cross_entropy_with_logits(inputs, torch.ones_like(inputs), reduction="none") - focal_pos: Tensor = ((1 - prob) ** gamma) * cross_entropy_loss_pos + focal_pos = ((1 - prob) ** gamma) * cross_entropy_loss_pos focal_pos *= alpha cross_entropy_loss_neg = binary_cross_entropy_with_logits(inputs, torch.zeros_like(inputs), reduction="none") - focal_neg: Tensor = (prob**gamma) * cross_entropy_loss_neg + focal_neg = (prob**gamma) * cross_entropy_loss_neg focal_neg *= 1 - alpha - loss: Tensor = torch.einsum("nc,mc->nm", focal_pos, labels) + torch.einsum("nc,mc->nm", focal_neg, (1 - labels)) + loss = torch.einsum("nc,mc->nm", focal_pos, labels) + torch.einsum("nc,mc->nm", focal_neg, (1 - labels)) return loss / height_and_width +# Copied from transformers.models.vit.modeling_vit.to_2tuple +def to_2tuple(x): + if isinstance(x, collections.abc.Iterable): + return x + return (x, x) + + +# Copied from transformers.models.swin.modeling_swin.window_partition +def window_partition(input_feature, window_size): + """ + Partitions the given input into windows. + """ + batch_size, height, width, num_channels = input_feature.shape + input_feature = input_feature.view( + batch_size, height // window_size, window_size, width // window_size, window_size, num_channels + ) + windows = input_feature.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, num_channels) + return windows + + +# Copied from transformers.models.swin.modeling_swin.window_reverse +def window_reverse(windows, window_size, height, width): + """ + Merges windows to produce higher resolution features. 
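The pair-wise losses above return `(num_queries, num_targets)` cost matrices rather than scalars; in DETR-style training they are combined with a classification cost and handed to `scipy.optimize.linear_sum_assignment` (imported earlier in this file) to match queries to ground-truth masks. The matcher itself is not shown in this excerpt, so the sketch below only illustrates the pattern, with weights mirroring the config defaults (`cross_entropy_weight`, `mask_weight`, `dice_weight`):

```python
import torch
from scipy.optimize import linear_sum_assignment

def match_queries_to_targets(pred_mask_logits, pred_class_probs, target_masks, target_labels,
                             class_weight=1.0, mask_weight=20.0, dice_weight=1.0):
    # pred_mask_logits: (num_queries, height * width) float logits
    # target_masks: (num_targets, height * width) float 0/1 masks
    # pred_class_probs: (num_queries, num_labels + 1); target_labels: (num_targets,) integer labels
    cost = -class_weight * pred_class_probs[:, target_labels]                      # classification cost
    cost = cost + mask_weight * pair_wise_sigmoid_focal_loss(pred_mask_logits, target_masks)
    cost = cost + dice_weight * pair_wise_dice_loss(pred_mask_logits, target_masks)
    query_idx, target_idx = linear_sum_assignment(cost.detach().cpu().numpy())     # optimal 1-to-1 matching
    return query_idx, target_idx
```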
+ """ + batch_size = int(windows.shape[0] / (height * width / window_size / window_size)) + windows = windows.view(batch_size, height // window_size, width // window_size, window_size, window_size, -1) + windows = windows.permute(0, 1, 3, 2, 4, 5).contiguous().view(batch_size, height, width, -1) + return windows + + +# Copied from transformers.models.swin.modeling_swin.drop_path +def drop_path(input, drop_prob=0.0, training=False, scale_by_keep=True): + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = input.new_empty(shape).bernoulli_(keep_prob) + if keep_prob > 0.0 and scale_by_keep: + random_tensor.div_(keep_prob) + return input * random_tensor + + +class MaskFormerSwinEmbeddings(nn.Module): + """ + Construct the patch and position embeddings for maskformer model. + """ + + def __init__(self, config): + super().__init__() + + self.patch_embeddings = MaskFormerSwinPatchEmbeddings( + image_size=config.image_size, + patch_size=config.patch_size, + num_channels=config.num_channels, + embed_dim=config.embed_dim, + ) + num_patches = self.patch_embeddings.num_patches + self.patch_grid = self.patch_embeddings.grid_size + + if config.use_absolute_embeddings: + self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.embed_dim)) + else: + self.position_embeddings = None + + self.norm = nn.LayerNorm(config.embed_dim) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, pixel_values): + embeddings, output_dimensions = self.patch_embeddings(pixel_values) + embeddings = self.norm(embeddings) + + if self.position_embeddings is not None: + embeddings = embeddings + self.position_embeddings + + embeddings = self.dropout(embeddings) + + return embeddings, output_dimensions + + +class MaskFormerSwinPatchEmbeddings(nn.Module): + """ + Image to Patch Embedding for maskformer model. 
+ """ + + def __init__(self, image_size=224, patch_size=16, num_channels=3, embed_dim=768): + super().__init__() + image_size = to_2tuple(image_size) + patch_size = to_2tuple(patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_patches = num_patches + self.grid_size = (image_size[0] // patch_size[0], image_size[1] // patch_size[1]) + + self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=patch_size) + + def maybe_pad(self, pixel_values, width, height): + if width % self.patch_size[1] != 0: + pad_values = (0, self.patch_size[1] - width % self.patch_size[1]) + pixel_values = nn.functional.pad(pixel_values, pad_values) + if height % self.patch_size[0] != 0: + pad_values = (0, 0, 0, self.patch_size[0] - height % self.patch_size[0]) + pixel_values = nn.functional.pad(pixel_values, pad_values) + return pixel_values + + def forward(self, pixel_values): + _, _, height, width = pixel_values.shape + # pad the input to be divisible by self.patch_size, if needed + pixel_values = self.maybe_pad(pixel_values, height, width) + embeddings = self.projection(pixel_values) + _, _, height, width = embeddings.shape + output_dimensions = (height, width) + embeddings_flat = embeddings.flatten(2).transpose(1, 2) + + return embeddings_flat, output_dimensions + + +class MaskFormerSwinPatchMerging(nn.Module): + """ + Patch Merging Layer for maskformer model. + + Args: + input_resolution (`Tuple[int]`): + Resolution of input feature. + dim (`int`): + Number of input channels. + norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`): + Normalization layer class. + """ + + def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) + self.norm = norm_layer(4 * dim) + + def maybe_pad(self, input_feature, width, height): + should_pad = (height % 2 == 1) or (width % 2 == 1) + if should_pad: + pad_values = (0, 0, 0, width % 2, 0, height % 2) + input_feature = nn.functional.pad(input_feature, pad_values) + + return input_feature + + def forward(self, input_feature, input_dimensions): + height, width = input_dimensions + # `dim` is height * width + batch_size, dim, num_channels = input_feature.shape + + input_feature = input_feature.view(batch_size, height, width, num_channels) + # pad input to be disible by width and height, if needed + input_feature = self.maybe_pad(input_feature, height, width) + + input_feature_0 = input_feature[:, 0::2, 0::2, :] # batch_size height/2 width/2 num_channels + input_feature_1 = input_feature[:, 1::2, 0::2, :] # batch_size height/2 width/2 num_channels + input_feature_2 = input_feature[:, 0::2, 1::2, :] # batch_size height/2 width/2 num_channels + input_feature_3 = input_feature[:, 1::2, 1::2, :] # batch_size height/2 width/2 num_channels + # batch_size height/2 width/2 4*num_channels + input_feature = torch.cat([input_feature_0, input_feature_1, input_feature_2, input_feature_3], -1) + input_feature = input_feature.view(batch_size, -1, 4 * num_channels) # batch_size height/2*width/2 4*C + + input_feature = self.norm(input_feature) + input_feature = self.reduction(input_feature) + + return input_feature + + +# Copied from transformers.models.swin.modeling_swin.MaskFormerSwinDropPath +class MaskFormerSwinDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def 
__init__(self, drop_prob=None, scale_by_keep=True): + super(MaskFormerSwinDropPath, self).__init__() + self.drop_prob = drop_prob + self.scale_by_keep = scale_by_keep + + def forward(self, input): + return drop_path(input, self.drop_prob, self.training, self.scale_by_keep) + + +class MaskFormerSwinSelfAttention(nn.Module): + def __init__(self, config, dim, num_heads): + super().__init__() + if dim % num_heads != 0: + raise ValueError( + f"The hidden size ({dim}) is not a multiple of the number of attention " f"heads ({num_heads})" + ) + + self.num_attention_heads = num_heads + self.attention_head_size = int(dim / num_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.window_size = to_2tuple(config.window_size) + + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1), num_heads) + ) + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(self.window_size[0]) + coords_w = torch.arange(self.window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) + coords_flatten = torch.flatten(coords, 1) + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] + relative_coords = relative_coords.permute(1, 2, 0).contiguous() + relative_coords[:, :, 0] += self.window_size[0] - 1 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) + self.register_buffer("relative_position_index", relative_position_index) + + self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + output_attentions=False, + ): + batch_size, dim, num_channels = hidden_states.shape + mixed_query_layer = self.query(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(mixed_query_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)] + relative_position_bias = relative_position_bias.view( + self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1 + ) + + relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() + attention_scores = attention_scores + relative_position_bias.unsqueeze(0) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in MaskFormerSwinModel forward() function) + mask_shape = attention_mask.shape[0] + attention_scores = attention_scores.view( + batch_size // mask_shape, mask_shape, self.num_attention_heads, dim, dim + ) + attention_scores = attention_scores + attention_mask.unsqueeze(1).unsqueeze(0) + attention_scores = attention_scores.view(-1, self.num_attention_heads, dim, dim) + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +# Copied from transformers.models.swin.modeling_swin.MaskFormerSwinSelfOutput +class MaskFormerSwinSelfOutput(nn.Module): + def __init__(self, config, dim): + super().__init__() + self.dense = nn.Linear(dim, dim) + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +# Copied from transformers.models.swin.modeling_swin.MaskFormerSwinAttention +class MaskFormerSwinAttention(nn.Module): + def __init__(self, config, dim, num_heads): + super().__init__() + self.self = MaskFormerSwinSelfAttention(config, dim, num_heads) + self.output = MaskFormerSwinSelfOutput(config, dim) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False): + self_outputs = self.self(hidden_states, attention_mask, 
head_mask, output_attentions) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.swin.modeling_swin.MaskFormerSwinIntermediate +class MaskFormerSwinIntermediate(nn.Module): + def __init__(self, config, dim): + super().__init__() + self.dense = nn.Linear(dim, int(config.mlp_ratio * dim)) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.swin.modeling_swin.MaskFormerSwinOutput +class MaskFormerSwinOutput(nn.Module): + def __init__(self, config, dim): + super().__init__() + self.dense = nn.Linear(int(config.mlp_ratio * dim), dim) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +class MaskFormerSwinBlock(nn.Module): + def __init__(self, config, dim, input_resolution, num_heads, shift_size=0): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.shift_size = shift_size + self.window_size = config.window_size + self.input_resolution = input_resolution + self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps) + self.attention = MaskFormerSwinAttention(config, dim, num_heads) + self.drop_path = ( + MaskFormerSwinDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + ) + self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps) + self.intermediate = MaskFormerSwinIntermediate(config, dim) + self.output = MaskFormerSwinOutput(config, dim) + + def get_attn_mask(self, input_resolution): + if self.shift_size > 0: + # calculate attention mask for SW-MSA + height, width = input_resolution + img_mask = torch.zeros((1, height, width, 1)) + height_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + width_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + count = 0 + for height_slice in height_slices: + for width_slice in width_slices: + img_mask[:, height_slice, width_slice, :] = count + count += 1 + + mask_windows = window_partition(img_mask, self.window_size) + mask_windows = mask_windows.view(-1, self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0)) + else: + attn_mask = None + return attn_mask + + def maybe_pad(self, hidden_states, height, width): + pad_left = pad_top = 0 + pad_rigth = (self.window_size - width % self.window_size) % self.window_size + pad_bottom = (self.window_size - height % self.window_size) % self.window_size + pad_values = (0, 0, pad_left, pad_rigth, pad_top, pad_bottom) + hidden_states = nn.functional.pad(hidden_states, pad_values) + return hidden_states, pad_values + + def forward(self, hidden_states, input_dimensions, head_mask=None, output_attentions=False): + height, width = input_dimensions + batch_size, dim, channels = hidden_states.size() + shortcut = hidden_states + + 
hidden_states = self.layernorm_before(hidden_states) + hidden_states = hidden_states.view(batch_size, height, width, channels) + # pad hidden_states to multiples of window size + hidden_states, pad_values = self.maybe_pad(hidden_states, height, width) + + _, height_pad, width_pad, _ = hidden_states.shape + # cyclic shift + if self.shift_size > 0: + shifted_hidden_states = torch.roll(hidden_states, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) + else: + shifted_hidden_states = hidden_states + + # partition windows + hidden_states_windows = window_partition(shifted_hidden_states, self.window_size) + hidden_states_windows = hidden_states_windows.view(-1, self.window_size * self.window_size, channels) + attn_mask = self.get_attn_mask((height_pad, width_pad)) + if attn_mask is not None: + attn_mask = attn_mask.to(hidden_states_windows.device) + + self_attention_outputs = self.attention( + hidden_states_windows, + attn_mask, + head_mask, + output_attentions=output_attentions, + ) + + attention_output = self_attention_outputs[0] + + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + attention_windows = attention_output.view(-1, self.window_size, self.window_size, channels) + shifted_windows = window_reverse(attention_windows, self.window_size, height_pad, width_pad) # B H' W' C + + # reverse cyclic shift + if self.shift_size > 0: + attention_windows = torch.roll(shifted_windows, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) + else: + attention_windows = shifted_windows + + was_padded = pad_values[2] > 0 or pad_values[3] + if was_padded: + attention_windows = attention_windows[:, :height, :width, :].contiguous() + + attention_windows = attention_windows.view(batch_size, height * width, channels) + + hidden_states = shortcut + self.drop_path(attention_windows) + + layer_output = self.layernorm_after(hidden_states) + layer_output = self.intermediate(layer_output) + layer_output = hidden_states + self.output(layer_output) + + outputs = (layer_output,) + outputs + + return outputs + + +class MaskFormerSwinLayer(nn.Module): + def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, downsample): + super().__init__() + self.config = config + self.dim = dim + self.blocks = nn.ModuleList( + [ + MaskFormerSwinBlock( + config=config, + dim=dim, + input_resolution=input_resolution, + num_heads=num_heads, + shift_size=0 if (i % 2 == 0) else config.window_size // 2, + ) + for i in range(depth) + ] + ) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(input_resolution, dim=dim, norm_layer=nn.LayerNorm) + else: + self.downsample = None + + self.pointing = False + + def forward( + self, hidden_states, input_dimensions, head_mask=None, output_attentions=False, output_hidden_states=False + ): + all_hidden_states = () if output_hidden_states else None + + height, width = input_dimensions + for i, block_module in enumerate(self.blocks): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + block_hidden_states = block_module( + hidden_states, + input_dimensions, + layer_head_mask, + output_attentions, + ) + + hidden_states = block_hidden_states[0] + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.downsample is not None: + height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2 + output_dimensions = (height, width, height_downsampled, 
width_downsampled) + hidden_states = self.downsample(hidden_states, input_dimensions) + else: + output_dimensions = (height, width, height, width) + + return hidden_states, output_dimensions, all_hidden_states + + +class MaskFormerSwinEncoder(nn.Module): + def __init__(self, config, grid_size): + super().__init__() + self.num_layers = len(config.depths) + self.config = config + dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))] + self.layers = nn.ModuleList( + [ + MaskFormerSwinLayer( + config=config, + dim=int(config.embed_dim * 2**i_layer), + input_resolution=(grid_size[0] // (2**i_layer), grid_size[1] // (2**i_layer)), + depth=config.depths[i_layer], + num_heads=config.num_heads[i_layer], + drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])], + downsample=MaskFormerSwinPatchMerging if (i_layer < self.num_layers - 1) else None, + ) + for i_layer in range(self.num_layers) + ] + ) + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + input_dimensions, + head_mask=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_input_dimensions = () + all_self_attentions = () if output_attentions else None + # add the embebeddings + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + for i, layer_module in enumerate(self.layers): + layer_head_mask = head_mask[i] if head_mask is not None else None + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_hidden_states, output_dimensions, layer_all_hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), hidden_states, layer_head_mask + ) + else: + layer_hidden_states, output_dimensions, layer_all_hidden_states = layer_module( + hidden_states, + input_dimensions, + layer_head_mask, + output_attentions, + output_hidden_states, + ) + + input_dimensions = (output_dimensions[-2], output_dimensions[-1]) + all_input_dimensions += (input_dimensions,) + if output_hidden_states: + all_hidden_states += (layer_all_hidden_states,) + + hidden_states = layer_hidden_states + + if output_attentions: + # TODO no idea if that is correct + all_self_attentions = all_self_attentions + (layer_all_hidden_states[1],) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + + return MaskFormerSwinBaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + hidden_states_spatial_dimensions=all_input_dimensions, + attentions=all_self_attentions, + ) + + +# Copied from transformers.models.swin.modeling_swin.MaskFormerSwinPreTrainedModel +class MaskFormerSwinPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + maskformer models. 
+ """ + + config_class = SwinConfig + base_model_prefix = "swin" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, MaskFormerSwinEncoder): + module.gradient_checkpointing = value + + +SWIN_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use + it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`SwinConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +SWIN_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See + [`AutoFeatureExtractor.__call__`] for details. + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare MaskFormerSwin Model transformer outputting raw hidden-states without any specific head on top.", + SWIN_START_DOCSTRING, +) +class MaskFormerSwinModel(MaskFormerSwinPreTrainedModel): + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + self.num_layers = len(config.depths) + self.num_features = int(config.embed_dim * 2 ** (self.num_layers - 1)) + + self.embeddings = MaskFormerSwinEmbeddings(config) + self.encoder = MaskFormerSwinEncoder(config, self.embeddings.patch_grid) + + self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps) + self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.patch_embeddings + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MaskFormerSwinModelOutputWithPooling, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=[1, 49, 768], + ) + def forward( + self, + pixel_values=None, + head_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, len(self.config.depths)) + + embedding_output, input_dimensions = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + embedding_output, + input_dimensions, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = encoder_outputs.last_hidden_state + sequence_output = self.layernorm(sequence_output) + + pooled_output = None + if self.pooler is not None: + pooled_output = self.pooler(sequence_output.transpose(1, 2)) + pooled_output = torch.flatten(pooled_output, 1) + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + hidden_states_spatial_dimensions = (input_dimensions,) + encoder_outputs.hidden_states_spatial_dimensions + + return MaskFormerSwinModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + hidden_states_spatial_dimensions=hidden_states_spatial_dimensions, + attentions=encoder_outputs.attentions, + ) + + +# Copied from transformers.models.detr.modeling_detr.DetrAttention +class DetrAttention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. + + Here, we add position embeddings to the queries and keys (as explained in the DETR paper). + """ + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + assert ( + self.head_dim * num_heads == self.embed_dim + ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})." 
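+        # scaling factor 1/sqrt(head_dim), applied to the query projections before the query-key dot product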
+ self.scaling = self.head_dim**-0.5 + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[torch.Tensor] = None, + key_value_states: Optional[torch.Tensor] = None, + key_value_position_embeddings: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + bsz, tgt_len, embed_dim = hidden_states.size() + + # add position embeddings to the hidden states before projecting to queries and keys + if position_embeddings is not None: + hidden_states_original = hidden_states + hidden_states = self.with_pos_embed(hidden_states, position_embeddings) + + # add key-value position embeddings to the key value states + if key_value_position_embeddings is not None: + key_value_states_original = key_value_states + key_value_states = self.with_pos_embed(key_value_states, key_value_position_embeddings) + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states_original), -1, bsz) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states_original), -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. 
+ # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + + +# Copied from transformers.models.detr.modeling_detr.DetrDecoderLayer +class DetrDecoderLayer(nn.Module): + def __init__(self, config: DetrConfig): + super().__init__() + self.embed_dim = config.d_model + + self.self_attn = DetrAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.encoder_attn = DetrAttention( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[torch.Tensor] = None, + query_position_embeddings: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + position_embeddings (`torch.FloatTensor`, *optional*): + position embeddings that are added to the queries and keys + in the cross-attention layer. + query_position_embeddings (`torch.FloatTensor`, *optional*): + position embeddings that are added to the queries and keys + in the self-attention layer. + encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. 
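+
+        Returns:
+            `Tuple[torch.FloatTensor]`: a tuple whose first element is the layer output; when
+            `output_attentions=True`, the self-attention and cross-attention weights are appended to the tuple.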
+ """ + residual = hidden_states + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + position_embeddings=query_position_embeddings, + attention_mask=attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Cross-Attention Block + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + + hidden_states, cross_attn_weights = self.encoder_attn( + hidden_states=hidden_states, + position_embeddings=query_position_embeddings, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + key_value_position_embeddings=position_embeddings, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + return outputs + + +class DetrPreTrainedModel(PreTrainedModel): + config_class = DetrConfig + base_model_prefix = "model" + main_input_name = "pixel_values" + + def _init_weights(self, module): + std = self.config.init_std + + if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, DetrDecoder): + module.gradient_checkpointing = value + + +# Copied from transformers.models.detr.modeling_detr._expand_mask +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) + + +# Copied from transformers.models.detr.modeling_detr.DetrDecoder +class DetrDecoder(DetrPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`DetrDecoderLayer`]. + + The decoder updates the query embeddings through multiple self-attention and cross-attention layers. + + Some small tweaks for DETR: + + - position_embeddings and query_position_embeddings are added to the forward pass. 
+ - if self.config.auxiliary_loss is set to True, also returns a stack of activations from all decoding layers. + + Args: + config: DetrConfig + """ + + def __init__(self, config: DetrConfig): + super().__init__(config) + self.dropout = config.dropout + self.layerdrop = config.decoder_layerdrop + + self.layers = nn.ModuleList([DetrDecoderLayer(config) for _ in range(config.decoder_layers)]) + # in DETR, the decoder uses layernorm after the last decoder layer output + self.layernorm = nn.LayerNorm(config.d_model) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + inputs_embeds=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + position_embeddings=None, + query_position_embeddings=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + The query embeddings that are passed into the decoder. + + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on certain queries. Mask values selected in `[0, 1]`: + + - 1 for queries that are **not masked**, + - 0 for queries that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*): + Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected + in `[0, 1]`: + + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + + position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Position embeddings that are added to the queries and keys in each cross-attention layer. + query_position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): + , *optional*): Position embeddings that are added to the queries and keys in each self-attention layer. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if inputs_embeds is not None: + hidden_states = inputs_embeds + input_shape = inputs_embeds.size()[:-1] + + combined_attention_mask = None + + if attention_mask is not None and combined_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = combined_attention_mask + _expand_mask( + attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + + # optional intermediate hidden states + intermediate = () if self.config.auxiliary_loss else None + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + dropout_probability = random.uniform(0, 1) + if self.training and (dropout_probability < self.layerdrop): + continue + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + combined_attention_mask, + encoder_hidden_states, + encoder_attention_mask, + None, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=combined_attention_mask, + position_embeddings=position_embeddings, + query_position_embeddings=query_position_embeddings, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if self.config.auxiliary_loss: + hidden_states = self.layernorm(hidden_states) + intermediate += (hidden_states,) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # finally, apply layernorm + hidden_states = self.layernorm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + # stack intermediate decoder activations + if self.config.auxiliary_loss: + intermediate = torch.stack(intermediate) + + if not return_dict: + return tuple( + v + for v in [hidden_states, all_hidden_states, all_self_attns, all_cross_attentions, intermediate] + if v is not None + ) + return DetrDecoderOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + intermediate_hidden_states=intermediate, + ) + + # refactored from original implementation class MaskFormerHungarianMatcher(nn.Module): - """This class computes an assignment between the 
labels and the predictions of the network + """This class computes an assignment between the labels and the predictions of the network. For efficiency reasons, the labels don't include the no_object. Because of this, in general, there are more predictions than labels. In this case, we do a 1-to-1 matching of the best predictions, while the others are @@ -475,7 +1815,7 @@ def __init__( pair of matched ground-truth / prediction (supervise class and mask) Args: - num_classes (int): The number of classes + num_classes (`int`): The number of classes matcher (MaskFormerHungarianMatcher): A torch module that computes the assigments between the predictions and labels weight_dict (Dict[str, float]): A dictionary of weights to be applied to the different losses @@ -496,20 +1836,32 @@ def __init__( def loss_labels( self, outputs: Dict[str, Tensor], labels: Dict[str, Tensor], indices: Tuple[np.array], num_masks: float ) -> Dict[str, Tensor]: - """Classification loss (NLL) - # TODO this doc was copied by the authors labels dicts must contain the key "labels" containing a tensor of dim - [nb_target_masks] + """Compute the losses related to the labels using cross entropy. + + Args: + outputs (`Dict[str, Tensor]`): + A dict of tensors with the **class_queries_logits** key. + labels (`Dict[str, Tensor]`): + A dict of tensors with the **class_labels** key. + indices (`Tuple[np.array])`: + The indices computed by the Hungarian matcher. + num_masks (`int)`: + The number of masks, used for normalization. + + Returns: + `Dict[str, Tensor]`: A dict of `torch.Tensor` containing the following key: + - **loss_cross_entropy** The loss computed using cross entropy on the predicted and ground truth labels. """ pred_logits: Tensor = outputs["class_queries_logits"] - b, q, _ = pred_logits.shape + batch_size, num_queries, _ = pred_logits.shape idx = self._get_src_permutation_idx(indices) # shape = [BATCH, N_QUERIES] target_classes_o: Tensor = torch.cat([target[j] for target, (_, j) in zip(labels["class_labels"], indices)]) # shape = [BATCH, N_QUERIES] target_classes: Tensor = torch.full( - (b, q), fill_value=self.num_classes, dtype=torch.int64, device=pred_logits.device + (batch_size, num_queries), fill_value=self.num_classes, dtype=torch.int64, device=pred_logits.device ) target_classes[idx] = target_classes_o # target_classes is a [BATCH, CLASSES, N_QUERIES], we need to permute pred_logits "b q c -> b c q" @@ -521,8 +1873,22 @@ def loss_labels( def loss_masks( self, outputs: Dict[str, Tensor], labels: Dict[str, Tensor], indices: Tuple[np.array], num_masks: int ) -> Dict[str, Tensor]: - """Compute the losses related to the masks: the focal loss and the dice loss. - labels dicts must contain the key "masks" containing a tensor of dim [nb_target_masks, h, w] + """Compute the losses related to the masks using focal and dice loss. + + Args: + outputs (`Dict[str, Tensor]`): + A dict of tensors with the **masks_queries_logits** key. + labels (`Dict[str, Tensor]`): + A dict of tensors with the **mask_labels** key. + indices (`Tuple[np.array])`: + The indices computed by the Hungarian matcher. + num_masks (`int)`: + The number of masks, used for normalization. + + Returns: + `Dict[str, Tensor]`: A dict of `torch.Tensor` containing two keys: + - **loss_mask** The loss computed using sigmoid focal loss on the predicted and ground truth masks. + - **loss_dice** The loss computed using dice loss on the predicted on the predicted and ground truth masks. 
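+
+        The indices returned by the matcher are used to align each predicted mask with its matched ground truth
+        mask before the sigmoid focal and dice losses are computed.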
""" src_idx = self._get_src_permutation_idx(indices) tgt_idx = self._get_tgt_permutation_idx(indices) @@ -563,11 +1929,28 @@ def get_loss(self, loss, outputs, labels, indices, num_masks): return loss_map[loss](outputs, labels, indices, num_masks) def forward(self, outputs: Dict[str, Tensor], labels: Dict[str, Tensor]) -> Dict[str, Tensor]: - """This performs the loss computation. - Parameters: - outputs: dict of tensors, see the output specification of the model for the format - labels: list of dicts, such that len(labels) == batch_size. - The expected keys in each dict depends on the losses applied, see each loss' doc + """ + This performs the loss computation. + + Args: + outputs (`Dict[str, Tensor]`): + A dict of `torch.Tensor` containing at least two keys: + - **class_queries_logits** The logits to be used for mask classification. + - **masks_queries_logits** The logits to be used for pixel classification. + if `use_auxilary_loss` was set to `true` in [`MaskFormerConfig`], then logits the dictionary also + contains **auxilary_predictions**. + labels (`Dict[str, Tensor]`): + A dict of `torch.Tensor` containing two keys: + - **class_labels** class labels. + - **mask_labels** mask labels. + + Returns: + `Dict[str, Tensor]`: A dict of `torch.Tensor` containing two keys: + - **loss_cross_entropy** The loss computed using cross entropy on the predicted and ground truth labels. + - **loss_mask** The loss computed using sigmoid focal loss on the predicted and ground truth masks. + - **loss_dice** The loss computed using dice loss on the predicted on the predicted and ground truth masks. + if `use_auxilary_loss` was set to `true` in [`MaskFormerConfig`], the dictionary contains addional losses + for each auxilary predictions. """ outputs_without_aux = { "masks_queries_logits": outputs["masks_queries_logits"], @@ -606,20 +1989,19 @@ def get_num_masks(self, labels: Dict[str, Tensor], device: torch.device) -> Numb return num_masks_clamped -class SwinTransformerBackbone(nn.Module): - """This class uses [`SwinModel`] to reshape it's `hidden_states` from (`batch_size, sequence_length, hidden_size)` to - (`batch_size, num_channels, height, width)`). +class MaskFormerSwinTransformerBackbone(nn.Module): + """ + This class uses [`MaskFormerSwinModel`] to reshape its `hidden_states` from (`batch_size, sequence_length, + hidden_size)` to (`batch_size, num_channels, height, width)`). Args: - config (SwinConfig): The configuration used by [`SwinModel`] + config (`SwinConfig`): The configuration used by [`MaskFormerSwinModel`]. """ def __init__(self, config: SwinConfig): super().__init__() - self.model = SwinModel(config) + self.model = MaskFormerSwinModel(config) self.hidden_states_norms = nn.ModuleList([nn.LayerNorm(out_shape) for out_shape in self.outputs_shapes]) - # little hack, our swin transformer has already the last norm, so let's switch the refence of the last item - # self.hidden_states_norms[-1] = self.model.layernorm def forward(self, *args, **kwargs) -> List[Tensor]: output = self.model(*args, **kwargs, output_hidden_states=True) @@ -652,13 +2034,16 @@ def outputs_shapes(self) -> List[int]: return [layer.dim for layer in self.model.encoder.layers] -class FPNConvLayer(nn.Sequential): +class MaskFormerFPNConvLayer(nn.Sequential): def __init__(self, in_features: int, out_features: int, kernel_size: int = 3, padding: int = 1): - """A basic module that executs conv - norm - in sequence used in MaskFormer. + """ + A basic module that executes conv - norm - in sequence used in MaskFormer. 
Args: - in_features (int): The number of input features (channels) - out_features (int): The number of outputs features (channels) + in_features (`int`): + The number of input features (channels). + out_features (`int`): + The number of outputs features (channels). """ super().__init__( nn.Conv2d(in_features, out_features, kernel_size=kernel_size, padding=padding, bias=False), @@ -667,14 +2052,17 @@ def __init__(self, in_features: int, out_features: int, kernel_size: int = 3, pa ) -class FPNLayer(nn.Module): +class MaskFormerFPNLayer(nn.Module): def __init__(self, in_features: int, lateral_features: int): - """A Feature Pyramid Network Layer. It creates a feature map by aggregating features from the previous and backbone layer. - Due to the spartial mismatch, the tensor coming from the previous layer is upsample. + """ + A Feature Pyramid Network Layer. It creates a feature map by aggregating features from the previous and + backbone layer. Due to the spatial mismatch, the tensor coming from the previous layer is upsampled. Args: - in_features (int): The number of input features (channels) - lateral_features (int): The number of lateral features (channels) + in_features (`int`): + The number of input features (channels). + lateral_features (`int`): + The number of lateral features (channels). """ super().__init__() self.proj = nn.Sequential( @@ -682,7 +2070,7 @@ def __init__(self, in_features: int, lateral_features: int): nn.GroupNorm(32, in_features), ) - self.block = FPNConvLayer(in_features, in_features) + self.block = MaskFormerFPNConvLayer(in_features, in_features) def forward(self, down: Tensor, left: Tensor) -> Tensor: left = self.proj(left) @@ -692,20 +2080,25 @@ def forward(self, down: Tensor, left: Tensor) -> Tensor: return down -class FPNModel(nn.Module): +class MaskFormerFPNModel(nn.Module): def __init__(self, in_features: int, lateral_widths: List[int], feature_size: int = 256): - """Feature Pyramid Network, given an input tensor and a set of features map of different feature/spatial size, it creates - a list of features map with different the same feature size. + """ + Feature Pyramid Network, given an input tensor and a set of features map of different feature/spatial size, it + creates a list of features maps with the same feature size. Args: - in_features (int): The number of input features (channels) - lateral_widths (List[int]): A list with the features (channels) size of each lateral connection - feature_size (int, optional): - The features (channels) of the resulting feature maps. Defaults to 256. + in_features (`int`): + The number of input features (channels). + lateral_widths (List[int]): + A list with the features (channels) size of each lateral connection. + feature_size (int, *optional*, defaults to `256`): + The features (channels) of the resulting feature maps. 
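+
+        The lateral feature maps are consumed from the deepest (smallest spatial resolution) backbone stage to the
+        shallowest one, following the usual top-down FPN pathway.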
""" super().__init__() - self.stem = FPNConvLayer(in_features, feature_size) - self.layers = nn.Sequential(*[FPNLayer(feature_size, lateral_width) for lateral_width in lateral_widths[::-1]]) + self.stem = MaskFormerFPNConvLayer(in_features, feature_size) + self.layers = nn.Sequential( + *[MaskFormerFPNLayer(feature_size, lateral_width) for lateral_width in lateral_widths[::-1]] + ) def forward(self, features: List[Tensor]) -> List[Tensor]: fpn_features: List[Tensor] = [] @@ -720,17 +2113,19 @@ def forward(self, features: List[Tensor]) -> List[Tensor]: class MaskFormerPixelDecoder(nn.Module): def __init__(self, *args, feature_size: int = 256, mask_feature_size: int = 256, **kwargs): - """Pixel Decoder Module proposed in [Per-Pixel Classification is Not All You Need for Semantic - Segmentation](https://arxiv.org/abs/2107.06278). It first run the backbone's feature into a Feature Pyramid - Network creating a list of features map. Then, it projects the last one to the correct `mask_size` + """ + Pixel Decoder Module proposed in [Per-Pixel Classification is Not All You Need for Semantic + Segmentation](https://arxiv.org/abs/2107.06278). It first runs the backbone's feature into a Feature Pyramid + Network creating a list of features maps. Then, it projects the last one to the correct `mask_size`. Args: - feature_size (int, optional): The features (channels) of FPN feature maps. Defaults to 256. - mask_feature_size (int, optional): - The features (channels) of the target masks size $C_{\epsilon}$ in the paper. Defaults to 256. + feature_size (int, *optional*, defaults to `256`): + The feature size (channel dimension) of the FPN feature maps. + mask_feature_size (int, *optional*, defaults to `256`): + The features (channels) of the target masks size $C_{\epsilon}$ in the paper. """ super().__init__() - self.fpn = FPNModel(*args, feature_size=feature_size, **kwargs) + self.fpn = MaskFormerFPNModel(*args, feature_size=feature_size, **kwargs) self.mask_projection = nn.Conv2d(feature_size, mask_feature_size, kernel_size=3, padding=1) def forward( @@ -745,7 +2140,7 @@ def forward( # copied and adapted from original implementation, also practically equal to DetrSinePositionEmbedding -class PositionEmbeddingSine(nn.Module): +class MaskFormerSinePositionEmbedding(nn.Module): """ This is a more standard version of the position embedding, very similar to the one used by the Attention is all you need paper, generalized to work on images. @@ -786,13 +2181,18 @@ def forward(self, x: Tensor, mask: Optional[Tensor] = None) -> Tensor: class MaskformerMLPPredictionHead(nn.Sequential): def __init__(self, input_dim: int, hidden_dim: int, output_dim: int, num_layers: int = 3): - """A classic Multi Layer Perceptron (MLP) + """ + A classic Multi Layer Perceptron (MLP). Args: - input_dim (int): The input dimensions - hidden_dim (int): The hidden dimensions - output_dim (int): The output dimensions - num_layers (int, optional): The number of layers. Defaults to 3. + input_dim (`int`): + The input dimensions + hidden_dim (`int`): + The hidden dimensions + output_dim (`int`): + The output dimensions + num_layers (int, *optional*, defaults to `3`): + The number of layers. 
""" in_dims: List[int] = [input_dim] + [hidden_dim] * (num_layers - 1) out_dims: List[int] = [hidden_dim] * (num_layers - 1) + [output_dim] @@ -810,11 +2210,16 @@ def __init__(self, input_dim: int, hidden_dim: int, output_dim: int, num_layers: class MaskFormerPixelLevelModule(nn.Module): def __init__(self, config: MaskFormerConfig): - """Pixel Level Module proposed in [Per-Pixel Classification is Not All You Need for Semantic + """ + Pixel Level Module proposed in [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278). It runs the input image trough a backbone and a pixel decoder, - generating a image features and pixel embeddings.""" + generating an image feature map and pixel embeddings. + + Args: + config ([`MaskFormerConfig`]) The configuration used to instantiate this model. + """ super().__init__() - self.encoder = SwinTransformerBackbone(config.backbone_config) + self.encoder = MaskFormerSwinTransformerBackbone(config.backbone_config) self.decoder = MaskFormerPixelDecoder( in_features=self.encoder.outputs_shapes[-1], feature_size=config.fpn_feature_size, @@ -837,12 +2242,14 @@ def forward( class MaskFormerTransformerModule(nn.Module): - """The MaskFormer's transformer module.""" + """ + The MaskFormer's transformer module. + """ def __init__(self, in_features: int, config: MaskFormerConfig): super().__init__() hidden_size: int = config.detr_config.hidden_size - self.position_embedder = PositionEmbeddingSine(num_pos_feats=hidden_size // 2, normalize=True) + self.position_embedder = MaskFormerSinePositionEmbedding(num_pos_feats=hidden_size // 2, normalize=True) self.queries_embedder = nn.Embedding(config.detr_config.num_queries, hidden_size) should_project = in_features != hidden_size self.input_projection = nn.Conv2d(in_features, hidden_size, kernel_size=1) if should_project else None @@ -904,7 +2311,7 @@ def forward(self, image_features: Tensor, output_hidden_states: Optional[bool] = output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. - + return_dict (`bool`, *optional*): Whether or not to return a [`~MaskFormerModelOutput`] instead of a plain tuple. 
""" @@ -916,20 +2323,19 @@ class MaskFormerPretrainedModel(PreTrainedModel): main_input_name = "pixel_values" def _init_weights(self, module: nn.Module): - std = self.config.init_std xavier_std = self.config.init_xavier_std if isinstance(module, MaskFormerTransformerModule): if module.input_projection is not None: nn.init.xavier_uniform_(module.input_projection.weight, gain=xavier_std) nn.init.constant_(module.input_projection.bias, 0) # FPN - elif isinstance(module, FPNModel): + elif isinstance(module, MaskFormerFPNModel): nn.init.xavier_uniform_(module.stem[0].weight, gain=xavier_std) - elif isinstance(module, FPNLayer): + elif isinstance(module, MaskFormerFPNLayer): nn.init.xavier_uniform_(module.proj[0].weight, gain=xavier_std) - elif isinstance(module, FPNConvLayer): + elif isinstance(module, MaskFormerFPNConvLayer): nn.init.xavier_uniform_(module[0].weight, gain=xavier_std) # The MLP head elif isinstance(module, MaskformerMLPPredictionHead): @@ -966,7 +2372,7 @@ def forward( self, pixel_values: Tensor, pixel_mask: Optional[Tensor] = None, - output_hidden_states: Option[bool] = False, + output_hidden_states: Optional[bool] = False, return_dict: Optional[bool] = True, ) -> MaskFormerOutput: @@ -1040,10 +2446,7 @@ def get_loss(self, loss_dict: Dict[str, Tensor]) -> Tensor: # probably an awkward way to reduce it return torch.tensor(list(loss_dict.values()), dtype=torch.float).sum() - def get_logits( - self, - outputs: MaskFormerOutput, - ) -> Tuple[Tensor, Tensor, List[str, Tensor]]: + def get_logits(self, outputs: MaskFormerOutput) -> Tuple[Tensor, Tensor, Dict[str, Tensor]]: pixel_embeddings: Tensor = outputs.pixel_decoder_last_hidden_state # get the auxilary predictions (one for each decoder's layer) auxilary_logits: List[str, Tensor] = [] @@ -1072,13 +2475,15 @@ def get_logits( return class_queries_logits, masks_queries_logits, auxilary_logits + @add_start_docstrings_to_model_forward(MASKFORMER_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=MaskFormerForInstanceSegmentationOutput, config_class=_CONFIG_FOR_DOC) def forward( self, pixel_values: Tensor, mask_labels: Optional[Tensor] = None, class_labels: Optional[Tensor] = None, pixel_mask: Optional[Tensor] = None, - output_hidden_states: Option[bool] = False, + output_hidden_states: Optional[bool] = False, return_dict: Optional[bool] = True, ) -> MaskFormerForInstanceSegmentationOutput: r""" @@ -1093,30 +2498,33 @@ def forward( Examples: ```python - >>> from transformers import MaskFormerFeatureExtractor, MaskFormerForObjectDetection + >>> from transformers import MaskFormerFeatureExtractor, MaskFormerForInstanceSegmentation >>> from PIL import Image >>> import requests + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> feature_extractor = MaskFormerFeatureExtractor.from_pretrained("facebook/maskformer-swin-base-ade-640") - >>> model = MaskFormerForInstanceSegmentation.from_pretrained("facebook/maskformer-swin-base-ade-640") + >>> feature_extractor = MaskFormerFeatureExtractor.from_pretrained("facebook/maskformer-swin-base-ade") + >>> model = MaskFormerForInstanceSegmentation.from_pretrained("facebook/maskformer-swin-base-ade") >>> inputs = feature_extractor(images=image, return_tensors="pt") >>> outputs = model(**inputs) - >>> # model predicts - class_queries_logits of shape `(batch_size, num_queries)` - >>> # and masks_queries_logits of shape `(batch_size, - num_queries, height, width)` + >>> # model predicts class_queries_logits of shape 
`(batch_size, num_queries)` + >>> # and masks_queries_logits of shape `(batch_size, num_queries, height, width)` >>> class_queries_logits = outputs.class_queries_logits >>> masks_queries_logits = outputs.masks_queries_logits >>> # you can pass them to feature_extractor for postprocessing >>> output = feature_extractor.post_process_segmentation(outputs) - >>> output = eature_extractor.post_process_panoptic_segmentation(outputs) + >>> output = feature_extractor.post_process_semantic_segmentation(outputs) + >>> output = feature_extractor.post_process_panoptic_segmentation(outputs) + ``` """ outputs: MaskFormerOutput = self.model(pixel_values, pixel_mask, output_hidden_states, return_dict) class_queries_logits, masks_queries_logits, auxilary_logits = self.get_logits(outputs) + loss, loss_dict, auxilary_logits = None, None, None + we_have_labels: bool = mask_labels is not None and class_labels is not None if we_have_labels: @@ -1141,6 +2549,6 @@ def forward( class_queries_logits=class_queries_logits, masks_queries_logits=masks_queries_logits, auxilary_logits=auxilary_logits, - loss_dict=loss_dict if we_have_labels else None, - loss=loss if we_have_labels else None, + loss_dict=loss_dict, + loss=loss, ) diff --git a/src/transformers/models/mbart/modeling_tf_mbart.py b/src/transformers/models/mbart/modeling_tf_mbart.py index f98408f8e1e73f..59e41bd6948939 100644 --- a/src/transformers/models/mbart/modeling_tf_mbart.py +++ b/src/transformers/models/mbart/modeling_tf_mbart.py @@ -44,8 +44,8 @@ TFWrappedEmbeddings, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_mbart import MBartConfig diff --git a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py index 928e7e8b16199a..9b16c79f18e68c 100644 --- a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py @@ -51,8 +51,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_mobilebert import MobileBertConfig diff --git a/src/transformers/models/mpnet/modeling_tf_mpnet.py b/src/transformers/models/mpnet/modeling_tf_mpnet.py index 0ed54a2ab1cab9..196a47b1fb830b 100644 --- a/src/transformers/models/mpnet/modeling_tf_mpnet.py +++ b/src/transformers/models/mpnet/modeling_tf_mpnet.py @@ -47,8 +47,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_mpnet import MPNetConfig diff --git a/src/transformers/models/openai/modeling_tf_openai.py b/src/transformers/models/openai/modeling_tf_openai.py index a924fb40231cbc..cb680603a1df88 100644 --- a/src/transformers/models/openai/modeling_tf_openai.py +++ b/src/transformers/models/openai/modeling_tf_openai.py @@ -39,8 +39,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_openai import OpenAIGPTConfig diff --git a/src/transformers/models/pegasus/modeling_tf_pegasus.py b/src/transformers/models/pegasus/modeling_tf_pegasus.py index 86f922e7bbc67d..cb146874062466 100644 --- a/src/transformers/models/pegasus/modeling_tf_pegasus.py +++ b/src/transformers/models/pegasus/modeling_tf_pegasus.py @@ -45,8 +45,8 @@ TFWrappedEmbeddings, input_processing, keras_serializable, - 
shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_pegasus import PegasusConfig diff --git a/src/transformers/models/poolformer/__init__.py b/src/transformers/models/poolformer/__init__.py new file mode 100644 index 00000000000000..246dc7645596f4 --- /dev/null +++ b/src/transformers/models/poolformer/__init__.py @@ -0,0 +1,58 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +# rely on isort to merge the imports +from ...file_utils import _LazyModule, is_torch_available, is_vision_available + + +_import_structure = { + "configuration_poolformer": ["POOLFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "PoolFormerConfig"], +} + +if is_vision_available(): + _import_structure["feature_extraction_poolformer"] = ["PoolFormerFeatureExtractor"] + +if is_torch_available(): + _import_structure["modeling_poolformer"] = [ + "POOLFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", + "PoolFormerForImageClassification", + "PoolFormerModel", + "PoolFormerPreTrainedModel", + ] + + +if TYPE_CHECKING: + from .configuration_poolformer import POOLFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, PoolFormerConfig + + if is_vision_available(): + from .feature_extraction_poolformer import PoolFormerFeatureExtractor + + if is_torch_available(): + from .modeling_poolformer import ( + POOLFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, + PoolFormerForImageClassification, + PoolFormerModel, + PoolFormerPreTrainedModel, + ) + + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure) diff --git a/src/transformers/models/poolformer/configuration_poolformer.py b/src/transformers/models/poolformer/configuration_poolformer.py new file mode 100644 index 00000000000000..d5cb07bd584fe7 --- /dev/null +++ b/src/transformers/models/poolformer/configuration_poolformer.py @@ -0,0 +1,127 @@ +# coding=utf-8 +# Copyright 2022 Sea AI Labs and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" PoolFormer model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +POOLFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "sail/poolformer_s12": "https://huggingface.co/sail/poolformer_s12/resolve/main/config.json", + # See all PoolFormer models at https://huggingface.co/models?filter=poolformer +} + + +class PoolFormerConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of [`PoolFormerModel`]. It is used to instantiate a + PoolFormer model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the PoolFormer + [sail/poolformer_s12](https://huggingface.co/sail/poolformer_s12) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + num_channels (`int`, *optional*, defaults to 3): + The number of channels in the input image. + patch_size (`int`, *optional*, defaults to 16): + The size of the input patch. + stride (`int`, *optional*, defaults to 16): + The stride of the input patch. + pool_size (`int`, *optional*, defaults to 3): + The size of the pooling window. + mlp_ratio (`float`, *optional*, defaults to 4.0): + The ratio of the number of channels in the output of the MLP to the number of channels in the input. + depths (`list`, *optional*, defaults to `[2, 2, 6, 2]`): + The depth of each encoder block. + hidden_sizes (`list`, *optional*, defaults to `[64, 128, 320, 512]`): + The hidden sizes of each encoder block. + patch_sizes (`list`, *optional*, defaults to `[7, 3, 3, 3]`): + The size of the input patch for each encoder block. + strides (`list`, *optional*, defaults to `[4, 2, 2, 2]`): + The stride of the input patch for each encoder block. + padding (`list`, *optional*, defaults to `[2, 1, 1, 1]`): + The padding of the input patch for each encoder block. + num_encoder_blocks (`int`, *optional*, defaults to 4): + The number of encoder blocks. + drop_path_rate (`float`, *optional*, defaults to 0.0): + The dropout rate for the dropout layers. + hidden_act (`str`, *optional*, defaults to `"gelu"`): + The activation function for the hidden layers. + use_layer_scale (`bool`, *optional*, defaults to `True`): + Whether to use layer scale. + layer_scale_init_value (`float`, *optional*, defaults to 1e-5): + The initial value for the layer scale. + initializer_range (`float`, *optional*, defaults to 0.02): + The initializer range for the weights. 
+ + Example: + + ```python + >>> from transformers import PoolFormerModel, PoolFormerConfig + + >>> # Initializing a PoolFormer sail/poolformer_s12 style configuration + >>> configuration = PoolFormerConfig() + + >>> # Initializing a model from the sail/poolformer_s12 style configuration + >>> model = PoolFormerModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + model_type = "poolformer" + + def __init__( + self, + num_channels=3, + patch_size=16, + stride=16, + pool_size=3, + mlp_ratio=4.0, + depths=[2, 2, 6, 2], + hidden_sizes=[64, 128, 320, 512], + patch_sizes=[7, 3, 3, 3], + strides=[4, 2, 2, 2], + padding=[2, 1, 1, 1], + num_encoder_blocks=4, + drop_path_rate=0.0, + hidden_act="gelu", + use_layer_scale=True, + layer_scale_init_value=1e-5, + initializer_range=0.02, + **kwargs + ): + self.num_channels = num_channels + self.patch_size = patch_size + self.stride = stride + self.padding = padding + self.pool_size = pool_size + self.hidden_sizes = hidden_sizes + self.mlp_ratio = mlp_ratio + self.depths = depths + self.patch_sizes = patch_sizes + self.strides = strides + self.num_encoder_blocks = num_encoder_blocks + self.drop_path_rate = drop_path_rate + self.hidden_act = hidden_act + self.use_layer_scale = use_layer_scale + self.layer_scale_init_value = layer_scale_init_value + self.initializer_range = initializer_range + super().__init__(**kwargs) diff --git a/src/transformers/models/poolformer/convert_poolformer_original_to_pytorch.py b/src/transformers/models/poolformer/convert_poolformer_original_to_pytorch.py new file mode 100644 index 00000000000000..eebc8b0c5e713d --- /dev/null +++ b/src/transformers/models/poolformer/convert_poolformer_original_to_pytorch.py @@ -0,0 +1,214 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert PoolFormer checkpoints from the original repository. 
URL: https://github.com/sail-sg/poolformer""" + +import argparse +import json +from collections import OrderedDict +from pathlib import Path + +import torch +from PIL import Image + +import requests +from huggingface_hub import cached_download, hf_hub_url +from transformers import PoolFormerConfig, PoolFormerFeatureExtractor, PoolFormerForImageClassification +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +def replace_key_with_offset(key, offset, original_name, new_name): + """ + Replaces the key by subtracting the offset from the original layer number + """ + to_find = original_name.split(".")[0] + key_list = key.split(".") + orig_block_num = int(key_list[key_list.index(to_find) - 2]) + layer_num = int(key_list[key_list.index(to_find) - 1]) + new_block_num = orig_block_num - offset + + key = key.replace(f"{orig_block_num}.{layer_num}.{original_name}", f"block.{new_block_num}.{layer_num}.{new_name}") + return key + + +def rename_keys(state_dict): + new_state_dict = OrderedDict() + total_embed_found, patch_emb_offset = 0, 0 + for key, value in state_dict.items(): + if key.startswith("network"): + key = key.replace("network", "poolformer.encoder") + if "proj" in key: + # Works for the first embedding as well as the internal embedding layers + if key.endswith("bias") and "patch_embed" not in key: + patch_emb_offset += 1 + to_replace = key[: key.find("proj")] + key = key.replace(to_replace, f"patch_embeddings.{total_embed_found}.") + key = key.replace("proj", "projection") + if key.endswith("bias"): + total_embed_found += 1 + if "patch_embeddings" in key: + key = "poolformer.encoder." + key + if "mlp.fc1" in key: + key = replace_key_with_offset(key, patch_emb_offset, "mlp.fc1", "output.conv1") + if "mlp.fc2" in key: + key = replace_key_with_offset(key, patch_emb_offset, "mlp.fc2", "output.conv2") + if "norm1" in key: + key = replace_key_with_offset(key, patch_emb_offset, "norm1", "before_norm") + if "norm2" in key: + key = replace_key_with_offset(key, patch_emb_offset, "norm2", "after_norm") + if "layer_scale_1" in key: + key = replace_key_with_offset(key, patch_emb_offset, "layer_scale_1", "layer_scale_1") + if "layer_scale_2" in key: + key = replace_key_with_offset(key, patch_emb_offset, "layer_scale_2", "layer_scale_2") + if "head" in key: + key = key.replace("head", "classifier") + new_state_dict[key] = value + return new_state_dict + + +# We will verify our results on a COCO image +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + + return image + + +@torch.no_grad() +def convert_poolformer_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_path): + """ + Copy/paste/tweak model's weights to our PoolFormer structure. 
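+
+    Example (the checkpoint path is a placeholder):
+
+    ```python
+    convert_poolformer_checkpoint(
+        model_name="poolformer_s12",
+        checkpoint_path="/path/to/poolformer_s12.pth.tar",
+        pytorch_dump_folder_path="./poolformer_s12",
+    )
+    ```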
+ """ + + # load default PoolFormer configuration + config = PoolFormerConfig() + + # set attributes based on model_name + repo_id = "datasets/huggingface/label-files" + size = model_name[-3:] + config.num_labels = 1000 + filename = "imagenet-1k-id2label.json" + expected_shape = (1, 1000) + + # set config attributes + id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename)), "r")) + id2label = {int(k): v for k, v in id2label.items()} + config.id2label = id2label + config.label2id = {v: k for k, v in id2label.items()} + if size == "s12": + config.depths = [2, 2, 6, 2] + config.hidden_sizes = [64, 128, 320, 512] + config.mlp_ratio = 4.0 + crop_pct = 0.9 + elif size == "s24": + config.depths = [4, 4, 12, 4] + config.hidden_sizes = [64, 128, 320, 512] + config.mlp_ratio = 4.0 + crop_pct = 0.9 + elif size == "s36": + config.depths = [6, 6, 18, 6] + config.hidden_sizes = [64, 128, 320, 512] + config.mlp_ratio = 4.0 + config.layer_scale_init_value = 1e-6 + crop_pct = 0.9 + elif size == "m36": + config.depths = [6, 6, 18, 6] + config.hidden_sizes = [96, 192, 384, 768] + config.mlp_ratio = 4.0 + config.layer_scale_init_value = 1e-6 + crop_pct = 0.95 + elif size == "m48": + config.depths = [8, 8, 24, 8] + config.hidden_sizes = [96, 192, 384, 768] + config.mlp_ratio = 4.0 + config.layer_scale_init_value = 1e-6 + crop_pct = 0.95 + else: + raise ValueError(f"Size {size} not supported") + + # load feature extractor + feature_extractor = PoolFormerFeatureExtractor(crop_pct=crop_pct) + + # Prepare image + image = prepare_img() + pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values + + logger.info(f"Converting model {model_name}...") + + # load original state dict + state_dict = torch.load(checkpoint_path, map_location=torch.device("cpu")) + + # rename keys + state_dict = rename_keys(state_dict) + + # create HuggingFace model and load state dict + model = PoolFormerForImageClassification(config) + model.load_state_dict(state_dict) + model.eval() + + # Define feature extractor + feature_extractor = PoolFormerFeatureExtractor(crop_pct=crop_pct) + pixel_values = feature_extractor(images=prepare_img(), return_tensors="pt").pixel_values + + # forward pass + outputs = model(pixel_values) + logits = outputs.logits + + # define expected logit slices for different models + if size == "s12": + expected_slice = torch.tensor([-0.3045, -0.6758, -0.4869]) + elif size == "s24": + expected_slice = torch.tensor([0.4402, -0.1374, -0.8045]) + elif size == "s36": + expected_slice = torch.tensor([-0.6080, -0.5133, -0.5898]) + elif size == "m36": + expected_slice = torch.tensor([0.3952, 0.2263, -1.2668]) + elif size == "m48": + expected_slice = torch.tensor([0.1167, -0.0656, -0.3423]) + else: + raise ValueError(f"Size {size} not supported") + + # verify logits + assert logits.shape == expected_shape + assert torch.allclose(logits[0, :3], expected_slice, atol=1e-2) + + # finally, save model and feature extractor + logger.info(f"Saving PyTorch model and feature extractor to {pytorch_dump_folder_path}...") + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + model.save_pretrained(pytorch_dump_folder_path) + print(f"Saving feature extractor to {pytorch_dump_folder_path}") + feature_extractor.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--model_name", + default="poolformer_s12", + type=str, + help="Name of the model you'd like to convert.", + ) + parser.add_argument( + "--checkpoint_path", 
default=None, type=str, help="Path to the original PyTorch checkpoint (.pth file)." + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." + ) + args = parser.parse_args() + convert_poolformer_checkpoint(args.model_name, args.checkpoint_path, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/poolformer/feature_extraction_poolformer.py b/src/transformers/models/poolformer/feature_extraction_poolformer.py new file mode 100644 index 00000000000000..b7d44e22265193 --- /dev/null +++ b/src/transformers/models/poolformer/feature_extraction_poolformer.py @@ -0,0 +1,172 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for PoolFormer.""" + +import math +from typing import Optional, Union + +import numpy as np +from PIL import Image + +from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin +from ...file_utils import TensorType +from ...image_utils import ( + IMAGENET_DEFAULT_MEAN, + IMAGENET_DEFAULT_STD, + ImageFeatureExtractionMixin, + ImageInput, + is_torch_tensor, +) +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class PoolFormerFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): + r""" + Constructs a PoolFormer feature extractor. + + This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users + should refer to this superclass for more information regarding those methods. + + Args: + do_resize_and_center_crop (`bool`, *optional*, defaults to `True`): + Whether to resize the shortest edge of the image and center crop the input to a certain `size`. + size (`int` or `Tuple(int)`, *optional*, defaults to 224): + Center crop the input to the given size. If a tuple is provided, it should be (width, height). If only an + integer is provided, then the input will be center cropped to (size, size). Only has an effect if + `do_resize_and_center_crop` is set to `True`. + resample (`int`, *optional*, defaults to `PIL.Image.BICUBIC`): + An optional resampling filter. This can be one of `PIL.Image.NEAREST`, `PIL.Image.BOX`, + `PIL.Image.BILINEAR`, `PIL.Image.HAMMING`, `PIL.Image.BICUBIC` or `PIL.Image.LANCZOS`. Only has an effect + if `do_resize_and_center_crop` is set to `True`. + crop_pct (`float`, *optional*, defaults to `0.9`): + The percentage of the image to crop from the center. Only has an effect if `do_resize_and_center_crop` is + set to `True`. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether or not to normalize the input with `image_mean` and `image_std`. + image_mean (`List[int]`, defaults to `[0.485, 0.456, 0.406]`): + The sequence of means for each channel, to be used when normalizing images. + image_std (`List[int]`, defaults to `[0.229, 0.224, 0.225]`): + The sequence of standard deviations for each channel, to be used when normalizing images. 
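+
+    Example (illustrative sketch; the dummy image below is a placeholder for a real photo):
+
+    ```python
+    >>> import numpy as np
+    >>> from PIL import Image
+    >>> from transformers import PoolFormerFeatureExtractor
+
+    >>> feature_extractor = PoolFormerFeatureExtractor(size=224, crop_pct=0.9)
+    >>> image = Image.fromarray(np.zeros((512, 512, 3), dtype=np.uint8))  # dummy RGB image
+    >>> inputs = feature_extractor(images=image, return_tensors="pt")
+    >>> # shortest edge resized to floor(224 / 0.9) = 248, then center cropped to 224
+    >>> list(inputs["pixel_values"].shape)
+    [1, 3, 224, 224]
+    ```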
+ """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize_and_center_crop=True, + size=224, + resample=Image.BICUBIC, + crop_pct=0.9, + do_normalize=True, + image_mean=None, + image_std=None, + **kwargs + ): + super().__init__(**kwargs) + self.do_resize_and_center_crop = do_resize_and_center_crop + self.size = size + self.resample = resample + self.crop_pct = crop_pct + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD + + def __call__( + self, images: ImageInput, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs + ) -> BatchFeature: + """ + Main method to prepare for the model one or several image(s). + + + + NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass + PIL images. + + + + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a + number of channels, H and W are image height and width. + + return_tensors (`str` or [`~file_utils.TensorType`], *optional*, defaults to `'np'`): + If set, will return tensors of a particular framework. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height, + width). + """ + # Input type checking for clearer error + valid_images = False + + # Check that images has a valid type + if isinstance(images, (Image.Image, np.ndarray)) or is_torch_tensor(images): + valid_images = True + elif isinstance(images, (list, tuple)): + if len(images) == 0 or isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]): + valid_images = True + + if not valid_images: + raise ValueError( + "Images must of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single example), " + "`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of examples)." 
+ ) + + is_batched = bool( + isinstance(images, (list, tuple)) + and (isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0])) + ) + + if not is_batched: + images = [images] + + # transformations (resizing + center cropping + normalization) + if self.do_resize_and_center_crop and self.size is not None and self.crop_pct is not None: + if isinstance(self.size, (tuple, list)): + assert len(self.size) == 2 + if self.size[-1] == self.size[-2]: + scale_size = int(math.floor(self.size[0] / self.crop_pct)) + else: + scale_size = tuple([int(x / self.crop_pct) for x in self.size]) + else: + scale_size = int(math.floor(self.size / self.crop_pct)) + + # resize shortest edge of the image + images = [ + self.resize(image=image, size=scale_size, resample=self.resample, default_to_square=False) + for image in images + ] + # center crop + images = [self.center_crop(image, size=self.size) for image in images] + + if self.do_normalize: + images = [self.normalize(image=image, mean=self.image_mean, std=self.image_std) for image in images] + + # return as BatchFeature + data = {"pixel_values": images} + encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) + + return encoded_inputs diff --git a/src/transformers/models/poolformer/modeling_poolformer.py b/src/transformers/models/poolformer/modeling_poolformer.py new file mode 100755 index 00000000000000..17205e31124728 --- /dev/null +++ b/src/transformers/models/poolformer/modeling_poolformer.py @@ -0,0 +1,499 @@ +# coding=utf-8 +# Copyright 2022 Sea AI Lab and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" PyTorch PoolFormer model.""" + + +import collections.abc +from dataclasses import dataclass +from typing import Optional, Tuple + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...file_utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, +) +from ...modeling_utils import PreTrainedModel +from ...utils import logging +from .configuration_poolformer import PoolFormerConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "PoolFormerConfig" +_FEAT_EXTRACTOR_FOR_DOC = "PoolFormerFeatureExtractor" + +# Base docstring +_CHECKPOINT_FOR_DOC = "sail/poolformer_s12" +_EXPECTED_OUTPUT_SHAPE = [1, 197, 768] + +# Image classification docstring +_IMAGE_CLASS_CHECKPOINT = "sail/poolformer_s12" +_IMAGE_CLASS_EXPECTED_OUTPUT = "'tabby, tabby cat'" + +POOLFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "sail/poolformer_s12", + # See all PoolFormer models at https://huggingface.co/models?filter=poolformer +] + + +# Copied from transformers.models.vit.modeling_vit.to_2tuple +def to_2tuple(x): + if isinstance(x, collections.abc.Iterable): + return x + return (x, x) + + +@dataclass +class PoolFormerModelOutput(ModelOutput): + """ + Class for PoolFormerModel's outputs, with potential hidden states. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer + plus the initial embedding outputs. + """ + + last_hidden_state: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class PoolFormerClassifierOutput(ModelOutput): + """ + Class for PoolformerForImageClassification's outputs. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, num_channels, height, width)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + """ + + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + + +def drop_path(x, drop_prob: float = 0.0, training: bool = False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, the original name is + misleading as 'Drop Connect' is a different form of dropout in a separate paper... 
See discussion: + https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the layer and + argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the argument. + """ + if drop_prob == 0.0 or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device) + random_tensor.floor_() # binarize + output = x.div(keep_prob) * random_tensor + return output + + +class PoolFormerDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob=None): + super().__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class PoolFormerEmbeddings(nn.Module): + """ + Construct Patch Embeddings. + """ + + def __init__(self, hidden_size, num_channels, patch_size, stride, padding, norm_layer=None): + super().__init__() + patch_size = to_2tuple(patch_size) + stride = to_2tuple(stride) + padding = to_2tuple(padding) + + self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=stride, padding=padding) + self.norm = norm_layer(hidden_size) if norm_layer else nn.Identity() + + def forward(self, pixel_values): + x = self.projection(pixel_values) + x = self.norm(x) + return x + + +class PoolFormerGroupNorm(nn.GroupNorm): + """ + Group Normalization with 1 group. Input: tensor in shape [B, C, H, W] + """ + + def __init__(self, num_channels, **kwargs): + super().__init__(1, num_channels, **kwargs) + + +class PoolFormerPooling(nn.Module): + def __init__(self, pool_size): + super().__init__() + self.pool = nn.AvgPool2d(pool_size, stride=1, padding=pool_size // 2, count_include_pad=False) + + def forward(self, hidden_states): + return self.pool(hidden_states) - hidden_states + + +class PoolFormerOutput(nn.Module): + def __init__(self, config, dropout_prob, hidden_size, intermediate_size): + super().__init__() + self.conv1 = nn.Conv2d(hidden_size, intermediate_size, 1) + self.conv2 = nn.Conv2d(intermediate_size, hidden_size, 1) + self.drop = PoolFormerDropPath(dropout_prob) + if isinstance(config.hidden_act, str): + self.act_fn = ACT2FN[config.hidden_act] + else: + self.act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.conv1(hidden_states) + hidden_states = self.act_fn(hidden_states) + hidden_states = self.drop(hidden_states) + hidden_states = self.conv2(hidden_states) + hidden_states = self.drop(hidden_states) + + return hidden_states + + +class PoolFormerLayer(nn.Module): + """This corresponds to the 'PoolFormerBlock' class in the original implementation.""" + + def __init__(self, config, num_channels, pool_size, hidden_size, intermediate_size, drop_path): + super().__init__() + self.pooling = PoolFormerPooling(pool_size) + self.output = PoolFormerOutput(config, drop_path, hidden_size, intermediate_size) + self.before_norm = PoolFormerGroupNorm(num_channels) + self.after_norm = PoolFormerGroupNorm(num_channels) + + # Useful for training neural nets + self.drop_path = PoolFormerDropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.use_layer_scale = config.use_layer_scale + if config.use_layer_scale: + self.layer_scale_1 = nn.Parameter( + config.layer_scale_init_value * torch.ones((num_channels)), requires_grad=True + ) + self.layer_scale_2 = 
nn.Parameter( + config.layer_scale_init_value * torch.ones((num_channels)), requires_grad=True + ) + + def forward(self, hidden_states): + if self.use_layer_scale: + pooling_output = self.pooling(self.before_norm(hidden_states)) + scaled_op = self.layer_scale_1.unsqueeze(-1).unsqueeze(-1) * pooling_output + # First residual connection + hidden_states = hidden_states + self.drop_path(scaled_op) + outputs = () + + layer_output = self.output(self.after_norm(hidden_states)) + scaled_op = self.layer_scale_2.unsqueeze(-1).unsqueeze(-1) * layer_output + # Second residual connection + output = hidden_states + self.drop_path(scaled_op) + + outputs = (output,) + outputs + return outputs + + else: + pooling_output = self.drop_path(self.pooling(self.before_norm(hidden_states))) + # First residual connection + hidden_states = pooling_output + hidden_states + outputs = () + + # Second residual connection inside the PoolFormerOutput block + layer_output = self.drop_path(self.output(self.after_norm(hidden_states))) + output = hidden_states + layer_output + + outputs = (output,) + outputs + return outputs + + +class PoolFormerEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + # stochastic depth decay rule + dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))] + + # patch embeddings + embeddings = [] + for i in range(config.num_encoder_blocks): + embeddings.append( + PoolFormerEmbeddings( + patch_size=config.patch_sizes[i], + stride=config.strides[i], + padding=config.padding[i], + num_channels=config.num_channels if i == 0 else config.hidden_sizes[i - 1], + hidden_size=config.hidden_sizes[i], + ) + ) + self.patch_embeddings = nn.ModuleList(embeddings) + + # Transformer blocks + blocks = [] + cur = 0 + for i in range(config.num_encoder_blocks): + # each block consists of layers + layers = [] + if i != 0: + cur += config.depths[i - 1] + for j in range(config.depths[i]): + layers.append( + PoolFormerLayer( + config, + num_channels=config.hidden_sizes[i], + pool_size=config.pool_size, + hidden_size=config.hidden_sizes[i], + intermediate_size=int(config.hidden_sizes[i] * config.mlp_ratio), + drop_path=dpr[cur + j], + ) + ) + blocks.append(nn.ModuleList(layers)) + + self.block = nn.ModuleList(blocks) + + def forward(self, pixel_values, output_hidden_states=False, return_dict=True): + all_hidden_states = () if output_hidden_states else None + + hidden_states = pixel_values + for idx, layers in enumerate(zip(self.patch_embeddings, self.block)): + embedding_layer, block_layer = layers + # Get patch embeddings from hidden_states + hidden_states = embedding_layer(hidden_states) + # Send the embeddings through the blocks + for i, blk in enumerate(block_layer): + layer_outputs = blk(hidden_states) + hidden_states = layer_outputs[0] + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states] if v is not None) + + return PoolFormerModelOutput(last_hidden_state=hidden_states, hidden_states=all_hidden_states) + + +class PoolFormerPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
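+
+    Example (illustrative sketch, assuming the `sail/poolformer_s12` checkpoint listed above is available on the Hub):
+
+    ```python
+    >>> from transformers import PoolFormerForImageClassification
+
+    >>> model = PoolFormerForImageClassification.from_pretrained("sail/poolformer_s12")
+    ```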
+ """ + + config_class = PoolFormerConfig + base_model_prefix = "poolformer" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, PoolFormerEncoder): + module.gradient_checkpointing = value + + +POOLFORMER_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use + it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`PoolFormerConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +POOLFORMER_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`PoolFormerFeatureExtractor`]. See + [`PoolFormerFeatureExtractor.__call__`] for details. +""" + + +@add_start_docstrings( + "The bare PoolFormer Model transformer outputting raw hidden-states without any specific head on top.", + POOLFORMER_START_DOCSTRING, +) +class PoolFormerModel(PoolFormerPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + + self.encoder = PoolFormerEncoder(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.patch_embeddings + + @add_start_docstrings_to_model_forward(POOLFORMER_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=PoolFormerModelOutput, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def forward(self, pixel_values=None, output_hidden_states=None, return_dict=None): + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + encoder_outputs = self.encoder( + pixel_values, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + + if not return_dict: + return (sequence_output, None) + encoder_outputs[1:] + + return PoolFormerModelOutput( + last_hidden_state=sequence_output, + hidden_states=encoder_outputs.hidden_states, + ) + + +class PoolFormerFinalPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + + def forward(self, hidden_states): + output = self.dense(hidden_states) + return output + + +@add_start_docstrings( + """ + PoolFormer Model transformer with an image classification head on top + """, + POOLFORMER_START_DOCSTRING, +) +class 
PoolFormerForImageClassification(PoolFormerPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.poolformer = PoolFormerModel(config) + + # Final norm + self.norm = PoolFormerGroupNorm(config.hidden_sizes[-1]) + # Classifier head + self.classifier = ( + nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity() + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(POOLFORMER_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=PoolFormerClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def forward( + self, + pixel_values=None, + labels=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.poolformer( + pixel_values, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.classifier(self.norm(sequence_output).mean([-2, -1])) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return PoolFormerClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states) diff --git a/src/transformers/models/rag/modeling_tf_rag.py b/src/transformers/models/rag/modeling_tf_rag.py index 4059b09cd80bc7..7ea2d3521b6117 100644 --- a/src/transformers/models/rag/modeling_tf_rag.py +++ b/src/transformers/models/rag/modeling_tf_rag.py @@ -1269,6 +1269,8 @@ def generate( ) if return_dict_in_generate: + # TODO(Patrick): `encoder_outputs`, `past` hack. 
+ # Remove after cleaning encoder-decoder outputs if output_attentions: model_kwargs["encoder_attentions"] = encoder_outputs.attentions if output_hidden_states: @@ -1350,28 +1352,35 @@ def extend_enc_output(tensor, num_beams=None): **model_kwargs, # encoder_outputs is here as in Pytorch's version ) else: - return self._generate_no_beam_search( - decoder_input_ids, - cur_len=cur_len, - max_length=max_length, - min_length=min_length, - do_sample=do_sample, - temperature=temperature, - top_k=top_k, - top_p=top_p, + pre_processor = self._get_logits_processor( repetition_penalty=repetition_penalty, no_repeat_ngram_size=no_repeat_ngram_size, bad_words_ids=bad_words_ids, + min_length=min_length, + eos_token_id=eos_token_id, + ) + # TODO(Patrick) clean-up once generate is fully cleaned up + model_kwargs["attention_mask"] = context_attention_mask + # TODO(Patrick) remove once generate is fully cleaned up + model_kwargs.pop("output_hidden_states", None) + model_kwargs.pop("output_attentions", None) + model_kwargs.pop("output_scores", None) + + # TODO(Patrick): `encoder_outputs`, `past` hack. + # Remove after cleaning encoder-decoder outputs + model_kwargs["past"] = encoder_outputs + + return self.greedy_search( + input_ids=decoder_input_ids, + max_length=max_length, pad_token_id=pad_token_id, eos_token_id=eos_token_id, - batch_size=batch_size, - vocab_size=vocab_size, - attention_mask=context_attention_mask, - use_cache=use_cache, - forced_bos_token_id=None, - forced_eos_token_id=None, + logits_processor=pre_processor, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + output_scores=output_scores, return_dict_in_generate=return_dict_in_generate, - **model_kwargs, # encoder_outputs is here as in Pytorch's version + **model_kwargs, ) def get_input_embeddings(self): diff --git a/src/transformers/models/rembert/modeling_tf_rembert.py b/src/transformers/models/rembert/modeling_tf_rembert.py index 9bf6ba6edeebc1..24a6387cd7c3b1 100644 --- a/src/transformers/models/rembert/modeling_tf_rembert.py +++ b/src/transformers/models/rembert/modeling_tf_rembert.py @@ -51,8 +51,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_rembert import RemBertConfig diff --git a/src/transformers/models/roberta/modeling_tf_roberta.py b/src/transformers/models/roberta/modeling_tf_roberta.py index 9aeb0a1eef58d6..b74863fb20793f 100644 --- a/src/transformers/models/roberta/modeling_tf_roberta.py +++ b/src/transformers/models/roberta/modeling_tf_roberta.py @@ -52,8 +52,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_roberta import RobertaConfig diff --git a/src/transformers/models/roformer/modeling_tf_roformer.py b/src/transformers/models/roformer/modeling_tf_roformer.py index 57a40a29058721..393114df01ff31 100644 --- a/src/transformers/models/roformer/modeling_tf_roformer.py +++ b/src/transformers/models/roformer/modeling_tf_roformer.py @@ -51,8 +51,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_roformer import RoFormerConfig diff --git a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py index 7c69684e06112d..0eba94521d2538 100755 --- 
a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py @@ -39,8 +39,8 @@ TFSharedEmbeddings, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_speech_to_text import Speech2TextConfig diff --git a/src/transformers/models/swin/configuration_swin.py b/src/transformers/models/swin/configuration_swin.py index 862c03f618773e..7b70c2692631d8 100644 --- a/src/transformers/models/swin/configuration_swin.py +++ b/src/transformers/models/swin/configuration_swin.py @@ -90,6 +90,10 @@ class SwinConfig(PretrainedConfig): ```""" model_type = "swin" + attribute_map = { + "num_attention_heads": "num_heads", + } + def __init__( self, image_size=224, @@ -130,3 +134,6 @@ def __init__( self.path_norm = patch_norm self.layer_norm_eps = layer_norm_eps self.initializer_range = initializer_range + # we set the hidden_size attribute in order to make Swin work with VisionEncoderDecoderModel + # this indicates the channel dimension after the last stage of the model + self.hidden_size = embed_dim * 8 diff --git a/src/transformers/models/swin/modeling_swin.py b/src/transformers/models/swin/modeling_swin.py index fc88bf4fb21db4..226675064738c3 100644 --- a/src/transformers/models/swin/modeling_swin.py +++ b/src/transformers/models/swin/modeling_swin.py @@ -17,8 +17,6 @@ import collections.abc import math -from dataclasses import dataclass -from typing import Optional, Tuple import torch import torch.utils.checkpoint @@ -26,13 +24,8 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN -from ...file_utils import ( - ModelOutput, - add_start_docstrings, - add_start_docstrings_to_model_forward, - replace_return_docstrings, -) -from ...modeling_outputs import BaseModelOutputWithPooling, SequenceClassifierOutput +from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, SequenceClassifierOutput from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import logging from .configuration_swin import SwinConfig @@ -59,80 +52,6 @@ ] -@dataclass -class SwinModelOutputWithPooling(ModelOutput): - """ - Base class for model's outputs that also contains a pooling of the last hidden states. - - Args: - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): - Last layer hidden-state of the first token of the sequence (classification token) after further processing - through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns - the classification token after processing through a linear layer and a tanh activation function. The linear - layer weights are trained from the next sentence prediction (classification) objective during pretraining. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. 
- - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - hidden_states_spatial_dimensions (`tuple(tuple(int, int))`, *optional*, a tuple containing the spatial dimension of each `hidden_state` needed to reshape the `hidden_states` to `batch, channels, height, width`. Due to padding, their spatial size cannot inferred before the `forward` method: - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ - - last_hidden_state: torch.FloatTensor = None - pooler_output: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - hidden_states_spatial_dimensions: Tuple[Tuple[int, int]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - -@dataclass -class SwinBaseModelOutput(ModelOutput): - """ - Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding). - - Args: - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - - If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, - hidden_size)` is output. - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if - `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, - encoder_sequence_length, embed_size_per_head)`. - - Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if - `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values` - input) to speed up sequential decoding. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - hidden_states_spatial_dimensions (`tuple(tuple(int, int))`, *optional*, a tuple containing the spatial dimension of each `hidden_state` needed to reshape the `hidden_states` to `batch, channels, height, width`. Due to padding, their spatial size cannot inferred before the `forward` method: - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
- """ - - last_hidden_state: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - hidden_states_spatial_dimensions: Tuple[Tuple[int, int]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - # to_2tuple, drop_path, SwinPatchEmbeddings, SwinPatchMerging and SwinDropPath are from the timm library. @@ -205,7 +124,7 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, pixel_values): - embeddings, output_dimensions = self.patch_embeddings(pixel_values) + embeddings = self.patch_embeddings(pixel_values) embeddings = self.norm(embeddings) if self.position_embeddings is not None: @@ -213,7 +132,7 @@ def forward(self, pixel_values): embeddings = self.dropout(embeddings) - return embeddings, output_dimensions + return embeddings class SwinPatchEmbeddings(nn.Module): @@ -233,25 +152,9 @@ def __init__(self, image_size=224, patch_size=16, num_channels=3, embed_dim=768) self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=patch_size) - def maybe_pad(self, pixel_values, width, height): - if width % self.patch_size[1] != 0: - pad_values = (0, self.patch_size[1] - width % self.patch_size[1]) - pixel_values = nn.functional.pad(pixel_values, pad_values) - if height % self.patch_size[0] != 0: - pad_values = (0, 0, 0, self.patch_size[0] - height % self.patch_size[0]) - pixel_values = nn.functional.pad(pixel_values, pad_values) - return pixel_values - def forward(self, pixel_values): - _, _, height, width = pixel_values.shape - # pad the input to be divisible by self.patch_size, if needed - pixel_values = self.maybe_pad(pixel_values, height, width) - embeddings = self.projection(pixel_values) - _, _, height, width = embeddings.shape - output_dimensions = (height, width) - embeddings_flat = embeddings.flatten(2).transpose(1, 2) - - return embeddings_flat, output_dimensions + embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2) + return embeddings class SwinPatchMerging(nn.Module): @@ -269,26 +172,17 @@ class SwinPatchMerging(nn.Module): def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm): super().__init__() + self.input_resolution = input_resolution self.dim = dim self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) self.norm = norm_layer(4 * dim) - def maybe_pad(self, input_feature, width, height): - should_pad = (height % 2 == 1) or (width % 2 == 1) - if should_pad: - pad_values = (0, 0, 0, width % 2, 0, height % 2) - input_feature = nn.functional.pad(input_feature, pad_values) - - return input_feature - - def forward(self, input_feature, input_dimensions): - height, width = input_dimensions + def forward(self, input_feature): + height, width = self.input_resolution # `dim` is height * width batch_size, dim, num_channels = input_feature.shape input_feature = input_feature.view(batch_size, height, width, num_channels) - # pad input to be disible by width and height, if needed - input_feature = self.maybe_pad(input_feature, height, width) input_feature_0 = input_feature[:, 0::2, 0::2, :] # batch_size height/2 width/2 num_channels input_feature_1 = input_feature[:, 1::2, 0::2, :] # batch_size height/2 width/2 num_channels @@ -493,12 +387,11 @@ def __init__(self, config, dim, input_resolution, num_heads, shift_size=0): self.shift_size = shift_size self.window_size = config.window_size self.input_resolution = input_resolution - # # TODO check this in the original implementation - # if min(self.input_resolution) <= self.window_size: - # # if window size 
is larger than input resolution, we don't partition windows - # self.shift_size = 0 - # self.window_size = min(self.input_resolution) - # print("asddsadsa", dim, self.input_resolution) + + if min(self.input_resolution) <= self.window_size: + # if window size is larger than input resolution, we don't partition windows + self.shift_size = 0 + self.window_size = min(self.input_resolution) self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps) self.attention = SwinAttention(config, dim, num_heads) @@ -507,10 +400,9 @@ def __init__(self, config, dim, input_resolution, num_heads, shift_size=0): self.intermediate = SwinIntermediate(config, dim) self.output = SwinOutput(config, dim) - def get_attn_mask(self, input_resolution): if self.shift_size > 0: # calculate attention mask for SW-MSA - height, width = input_resolution + height, width = self.input_resolution img_mask = torch.zeros((1, height, width, 1)) height_slices = ( slice(0, -self.window_size), @@ -534,27 +426,17 @@ def get_attn_mask(self, input_resolution): attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0)) else: attn_mask = None - return attn_mask - - def maybe_pad(self, hidden_states, height, width): - pad_left = pad_top = 0 - pad_rigth = (self.window_size - width % self.window_size) % self.window_size - pad_bottom = (self.window_size - height % self.window_size) % self.window_size - pad_values = (0, 0, pad_left, pad_rigth, pad_top, pad_bottom) - hidden_states = nn.functional.pad(hidden_states, pad_values) - return hidden_states, pad_values - - def forward(self, hidden_states, input_dimensions, head_mask=None, output_attentions=False): - height, width = input_dimensions + + self.attn_mask = attn_mask + + def forward(self, hidden_states, head_mask=None, output_attentions=False): + height, width = self.input_resolution batch_size, dim, channels = hidden_states.size() shortcut = hidden_states hidden_states = self.layernorm_before(hidden_states) hidden_states = hidden_states.view(batch_size, height, width, channels) - # pad hidden_states to multiples of window size - hidden_states, pad_values = self.maybe_pad(hidden_states, height, width) - _, height_pad, width_pad, _ = hidden_states.shape # cyclic shift if self.shift_size > 0: shifted_hidden_states = torch.roll(hidden_states, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) @@ -564,13 +446,13 @@ def forward(self, hidden_states, input_dimensions, head_mask=None, output_attent # partition windows hidden_states_windows = window_partition(shifted_hidden_states, self.window_size) hidden_states_windows = hidden_states_windows.view(-1, self.window_size * self.window_size, channels) - attn_mask = self.get_attn_mask((height_pad, width_pad)) - if attn_mask is not None: - attn_mask = attn_mask.to(hidden_states_windows.device) + + if self.attn_mask is not None: + self.attn_mask = self.attn_mask.to(hidden_states_windows.device) self_attention_outputs = self.attention( hidden_states_windows, - attn_mask, + self.attn_mask, head_mask, output_attentions=output_attentions, ) @@ -580,7 +462,7 @@ def forward(self, hidden_states, input_dimensions, head_mask=None, output_attent outputs = self_attention_outputs[1:] # add self attentions if we output attention weights attention_windows = attention_output.view(-1, self.window_size, self.window_size, channels) - shifted_windows = window_reverse(attention_windows, self.window_size, height_pad, width_pad) # B H' W' C + shifted_windows = window_reverse(attention_windows, self.window_size, 
height, width) # B H' W' C # reverse cyclic shift if self.shift_size > 0: @@ -588,10 +470,6 @@ def forward(self, hidden_states, input_dimensions, head_mask=None, output_attent else: attention_windows = shifted_windows - was_padded = pad_values[2] > 0 or pad_values[3] - if was_padded: - attention_windows = attention_windows[:, :height, :width, :].contiguous() - attention_windows = attention_windows.view(batch_size, height * width, channels) hidden_states = shortcut + self.drop_path(attention_windows) @@ -631,38 +509,29 @@ def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, d self.pointing = False - def forward( - self, hidden_states, input_dimensions, head_mask=None, output_attentions=False, output_hidden_states=False - ): + def forward(self, hidden_states, head_mask=None, output_attentions=False, output_hidden_states=False): all_hidden_states = () if output_hidden_states else None - height, width = input_dimensions for i, block_module in enumerate(self.blocks): if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) layer_head_mask = head_mask[i] if head_mask is not None else None - block_hidden_states = block_module( + layer_outputs = block_module( hidden_states, - input_dimensions, layer_head_mask, output_attentions, ) - hidden_states = block_hidden_states[0] - - if output_hidden_states: - all_hidden_states += (hidden_states,) + hidden_states = layer_outputs[0] if self.downsample is not None: - height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2 - output_dimensions = (height, width, height_downsampled, width_downsampled) - hidden_states = self.downsample(hidden_states, input_dimensions) - else: - output_dimensions = (height, width, height, width) + layer_outputs_list = list(layer_outputs) + layer_outputs_list[0] = self.downsample(layer_outputs[0]) + layer_outputs = tuple(layer_outputs_list) - return hidden_states, output_dimensions, all_hidden_states + return layer_outputs class SwinEncoder(nn.Module): @@ -691,20 +560,18 @@ def __init__(self, config, grid_size): def forward( self, hidden_states, - input_dimensions, head_mask=None, output_attentions=False, output_hidden_states=False, return_dict=True, ): all_hidden_states = () if output_hidden_states else None - all_input_dimensions = () all_self_attentions = () if output_attentions else None - # add the embebeddings - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) for i, layer_module in enumerate(self.layers): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + layer_head_mask = head_mask[i] if head_mask is not None else None if self.gradient_checkpointing and self.training: @@ -715,37 +582,24 @@ def custom_forward(*inputs): return custom_forward - layer_hidden_states, output_dimensions, layer_all_hidden_states = torch.utils.checkpoint.checkpoint( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, layer_head_mask ) else: - layer_hidden_states, output_dimensions, layer_all_hidden_states = layer_module( - hidden_states, - input_dimensions, - layer_head_mask, - output_attentions, - output_hidden_states, - ) - - input_dimensions = (output_dimensions[-2], output_dimensions[-1]) - all_input_dimensions += (input_dimensions,) - if output_hidden_states: - all_hidden_states += (layer_all_hidden_states,) - - hidden_states = layer_hidden_states + layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions) + hidden_states = 
layer_outputs[0] if output_attentions: - # TODO no idea if that is correct - all_self_attentions = all_self_attentions + (layer_all_hidden_states[1],) + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) if not return_dict: return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) - return SwinBaseModelOutput( - last_hidden_state=hidden_states, - hidden_states=all_hidden_states, - hidden_states_spatial_dimensions=all_input_dimensions, - attentions=all_self_attentions, + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attentions ) @@ -842,7 +696,14 @@ class PreTrainedModel self.encoder.layer[layer].attention.prune_heads(heads) @add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING) - # @replace_return_docstrings(output_type=SwinModelOutputWithPooling, config_class=_CONFIG_FOR_DOC) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPooling, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) def forward( self, pixel_values=None, @@ -867,18 +728,17 @@ def forward( # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] head_mask = self.get_head_mask(head_mask, len(self.config.depths)) - embedding_output, input_dimensions = self.embeddings(pixel_values) + embedding_output = self.embeddings(pixel_values) encoder_outputs = self.encoder( embedding_output, - input_dimensions, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) - sequence_output = encoder_outputs.last_hidden_state + sequence_output = encoder_outputs[0] sequence_output = self.layernorm(sequence_output) pooled_output = None @@ -889,13 +749,10 @@ def forward( if not return_dict: return (sequence_output, pooled_output) + encoder_outputs[1:] - hidden_states_spatial_dimensions = (input_dimensions,) + encoder_outputs.hidden_states_spatial_dimensions - - return SwinModelOutputWithPooling( + return BaseModelOutputWithPooling( last_hidden_state=sequence_output, pooler_output=pooled_output, hidden_states=encoder_outputs.hidden_states, - hidden_states_spatial_dimensions=hidden_states_spatial_dimensions, attentions=encoder_outputs.attentions, ) @@ -923,13 +780,13 @@ def __init__(self, config): self.post_init() @add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING) - # @add_code_sample_docstrings( - # processor_class=_FEAT_EXTRACTOR_FOR_DOC, - # checkpoint=_IMAGE_CLASS_CHECKPOINT, - # output_type=SequenceClassifierOutput, - # config_class=_CONFIG_FOR_DOC, - # expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, - # ) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) def forward( self, pixel_values=None, diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py index 3af2a53de261fa..0c211caccfdebd 100644 --- a/src/transformers/models/t5/modeling_t5.py +++ b/src/transformers/models/t5/modeling_t5.py @@ -237,14 +237,19 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path): class T5LayerNorm(nn.Module): def __init__(self, hidden_size, eps=1e-6): """ - 
Construct a layernorm module in the T5 style No bias and no subtraction of mean. + Construct a layernorm module in the T5 style. No bias and no subtraction of mean. """ super().__init__() self.weight = nn.Parameter(torch.ones(hidden_size)) self.variance_epsilon = eps def forward(self, hidden_states): - # layer norm should always be calculated in float32 + + # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean + # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated + # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for + # half-precision inputs is done in fp32 + variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) @@ -255,6 +260,20 @@ def forward(self, hidden_states): return self.weight * hidden_states +try: + from apex.normalization import FusedRMSNorm + + T5LayerNorm = FusedRMSNorm # noqa + + logger.info("Discovered apex.normalization.FusedRMSNorm - will use it instead of T5LayerNorm") +except ImportError: + # using the normal T5LayerNorm + pass +except Exception: + logger.warning("discovered apex but it failed to load, falling back to T5LayerNorm") + pass + + class T5DenseReluDense(nn.Module): def __init__(self, config): super().__init__() diff --git a/src/transformers/models/t5/modeling_tf_t5.py b/src/transformers/models/t5/modeling_tf_t5.py index 5b030342ff5649..ca307df70ebcce 100644 --- a/src/transformers/models/t5/modeling_tf_t5.py +++ b/src/transformers/models/t5/modeling_tf_t5.py @@ -44,8 +44,8 @@ TFWrappedEmbeddings, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_t5 import T5Config diff --git a/src/transformers/models/tapas/modeling_tf_tapas.py b/src/transformers/models/tapas/modeling_tf_tapas.py index cdb7e8c113e961..46baba2627983d 100644 --- a/src/transformers/models/tapas/modeling_tf_tapas.py +++ b/src/transformers/models/tapas/modeling_tf_tapas.py @@ -45,8 +45,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_tapas import TapasConfig diff --git a/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py b/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py index ab8fb6f11b73f0..4534a4884aa76d 100644 --- a/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py +++ b/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py @@ -34,8 +34,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_transfo_xl import TransfoXLConfig from .modeling_tf_transfo_xl_utilities import TFAdaptiveSoftmaxMask @@ -597,14 +597,8 @@ def call( mlen = shape_list(inputs["mems"][0])[0] if inputs["mems"] is not None else 0 klen = mlen + qlen - attn_mask = tf.ones([qlen, qlen]) - mask_u = tf.linalg.band_part(attn_mask, 0, -1) - mask_dia = tf.linalg.band_part(attn_mask, 0, 0) - attn_mask_pad = tf.zeros([qlen, mlen]) - dec_attn_mask = tf.concat([attn_mask_pad, mask_u - mask_dia], 1) - if self.same_length: - mask_l = tf.linalg.band_part(attn_mask, -1, 0) - dec_attn_mask = tf.concat([dec_attn_mask[:, :qlen] + mask_l - mask_dia, dec_attn_mask[:, qlen:]], 1) + # Compute decoder attention mask + # ::: PyTorch masking code for reference ::: # if self.same_length: # all_ones = 
word_emb.new_ones((qlen, klen), dtype=torch.uint8) @@ -619,6 +613,21 @@ def call( # dec_attn_mask = torch.triu( # word_emb.new_ones((qlen, klen), dtype=torch.uint8), diagonal=1+mlen)[:,:,None] + # TensorFlow version + dec_attn_mask = 1 - tf.linalg.band_part( + tf.ones([qlen, klen], dtype=tf.int32), -1, mlen + ) # (q, q): diagonal with 1's + if self.same_length: + mask_len = klen - self.mem_len + if mask_len > 0: + mask_shift_len = qlen - mask_len + else: + mask_shift_len = qlen + if mask_shift_len >= 1: + dec_attn_mask += 1 - tf.linalg.band_part(tf.ones([qlen, klen], dtype=tf.int32), mask_shift_len - 1, -1) + else: + dec_attn_mask += tf.linalg.band_part(tf.ones([qlen, klen], dtype=tf.int32), -1, -mask_shift_len) + hids = [] attentions = [] if inputs["output_attentions"] else None if self.attn_type == 0: # default diff --git a/src/transformers/models/transfo_xl/modeling_tf_transfo_xl_utilities.py b/src/transformers/models/transfo_xl/modeling_tf_transfo_xl_utilities.py index 53eb8239a5a4a0..af95f348ec28f7 100644 --- a/src/transformers/models/transfo_xl/modeling_tf_transfo_xl_utilities.py +++ b/src/transformers/models/transfo_xl/modeling_tf_transfo_xl_utilities.py @@ -20,7 +20,7 @@ import tensorflow as tf -from ...modeling_tf_utils import shape_list +from ...tf_utils import shape_list class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): diff --git a/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py index 446f92fec0c111..b462b378fc3748 100644 --- a/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py +++ b/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py @@ -393,7 +393,7 @@ def encode( Example: ```python - >>> from transformers import FlaxVisionEncoderDecoderModel + >>> from transformers import ViTFeatureExtractor, FlaxVisionEncoderDecoderModel >>> from PIL import Image >>> import requests @@ -403,7 +403,9 @@ def encode( >>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k") >>> # initialize a vit-gpt2 from pretrained ViT and GPT2 models. Note that the cross-attention layers will be randomly initialized - >>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained("vit", "gpt2") + >>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained( + ... "google/vit-base-patch16-224-in21k", "gpt2" + ... ) >>> pixel_values = feature_extractor(images=image, return_tensors="np").pixel_values >>> encoder_outputs = model.encode(pixel_values) @@ -469,7 +471,7 @@ def decode( Example: ```python - >>> from transformers import FlaxVisionEncoderDecoderModel + >>> from transformers import ViTFeatureExtractor, FlaxVisionEncoderDecoderModel >>> import jax.numpy as jnp >>> from PIL import Image >>> import requests @@ -480,7 +482,9 @@ def decode( >>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k") >>> # initialize a vit-gpt2 from pretrained ViT and GPT2 models. Note that the cross-attention layers will be randomly initialized - >>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained("vit", "gpt2") + >>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained( + ... "google/vit-base-patch16-224-in21k", "gpt2" + ... 
) >>> pixel_values = feature_extractor(images=image, return_tensors="np").pixel_values >>> encoder_outputs = model.encode(pixel_values) @@ -610,7 +614,9 @@ def __call__( >>> tokenizer_output = GPT2Tokenizer.from_pretrained("gpt2") >>> # initialize a vit-gpt2 from pretrained ViT and GPT2 models. Note that the cross-attention layers will be randomly initialized - >>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained("vit", "gpt2") + >>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained( + ... "google/vit-base-patch16-224-in21k", "gpt2" + ... ) >>> pixel_values = feature_extractor(images=image, return_tensors="np").pixel_values diff --git a/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py index 06bcbf7c4b97d9..244c836b8c3f11 100644 --- a/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py +++ b/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py @@ -30,13 +30,8 @@ replace_return_docstrings, ) from ...modeling_tf_outputs import TFBaseModelOutput, TFSeq2SeqLMOutput -from ...modeling_tf_utils import ( - TFCausalLanguageModelingLoss, - TFPreTrainedModel, - get_initializer, - input_processing, - shape_list, -) +from ...modeling_tf_utils import TFCausalLanguageModelingLoss, TFPreTrainedModel, get_initializer, input_processing +from ...tf_utils import shape_list from ...utils import logging from ..auto.configuration_auto import AutoConfig from ..auto.modeling_tf_auto import TFAutoModel, TFAutoModelForCausalLM diff --git a/src/transformers/models/vit/modeling_tf_vit.py b/src/transformers/models/vit/modeling_tf_vit.py index b1e027c964825e..9a7025c662d71e 100644 --- a/src/transformers/models/vit/modeling_tf_vit.py +++ b/src/transformers/models/vit/modeling_tf_vit.py @@ -32,8 +32,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_vit import ViTConfig diff --git a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py index 6c079fcbf268c3..6ef3a3f98d02f6 100644 --- a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py @@ -30,13 +30,8 @@ replace_return_docstrings, ) from ...modeling_tf_outputs import TFBaseModelOutput, TFCausalLMOutput -from ...modeling_tf_utils import ( - TFPreTrainedModel, - booleans_processing, - get_initializer, - keras_serializable, - shape_list, -) +from ...modeling_tf_utils import TFPreTrainedModel, booleans_processing, get_initializer, keras_serializable +from ...tf_utils import shape_list from ...tokenization_utils_base import BatchEncoding from ...utils import logging from .configuration_wav2vec2 import Wav2Vec2Config diff --git a/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py b/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py index c31b209c1879a6..ca59a948ff3bed 100644 --- a/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py +++ b/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py @@ -42,9 +42,15 @@ class Wav2Vec2DecoderWithLMOutput(ModelOutput): Args: text (list of `str`): Decoded logits in text from. Usually the speech transcription. + logit_score (list of `float`): + Total logit score of the beam associated with produced text. 
+ lm_score (list of `float`): + Fused lm_score of the beam associated with produced text. """ text: Union[List[str], str] + logit_score: Union[List[float], float] = None + lm_score: Union[List[float], float] = None class Wav2Vec2ProcessorWithLM(ProcessorMixin): @@ -127,11 +133,13 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): feature_extractor, tokenizer = super()._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs) - if os.path.isdir(pretrained_model_name_or_path): + if os.path.isdir(pretrained_model_name_or_path) or os.path.isfile(pretrained_model_name_or_path): decoder = BeamSearchDecoderCTC.load_from_dir(pretrained_model_name_or_path) else: # BeamSearchDecoderCTC has no auto class kwargs.pop("_from_auto", None) + # snapshot_download has no `trust_remote_code` flag + kwargs.pop("trust_remote_code", None) # make sure that only relevant filenames are downloaded language_model_filenames = os.path.join(BeamSearchDecoderCTC._LANGUAGE_MODEL_SERIALIZED_DIRECTORY, "*") @@ -283,7 +291,8 @@ def batch_decode( ) # create multiprocessing pool and list numpy arrays - logits_list = [array for array in logits] + # filter out logits padding + logits_list = [array[(array != -100.0).all(axis=-1)] for array in logits] pool = get_context("fork").Pool(num_processes) # pyctcdecode @@ -300,11 +309,14 @@ def batch_decode( # clone multi-processing pool pool.close() - # extract text - batch_texts = [d[0][0] for d in decoded_beams] - + # extract text and scores + batch_texts, logit_scores, lm_scores = [], [], [] + for d in decoded_beams: + batch_texts.append(d[0][0]) + logit_scores.append(d[0][-2]) + lm_scores.append(d[0][-1]) # more output features will be added in the future - return Wav2Vec2DecoderWithLMOutput(text=batch_texts) + return Wav2Vec2DecoderWithLMOutput(text=batch_texts, logit_score=logit_scores, lm_score=lm_scores) def decode( self, @@ -379,7 +391,9 @@ def decode( ) # more output features will be added in the future - return Wav2Vec2DecoderWithLMOutput(text=decoded_beams[0][0]) + return Wav2Vec2DecoderWithLMOutput( + text=decoded_beams[0][0], logit_score=decoded_beams[0][-2], lm_score=decoded_beams[0][-1] + ) @contextmanager def as_target_processor(self): diff --git a/src/transformers/models/xlm/modeling_tf_xlm.py b/src/transformers/models/xlm/modeling_tf_xlm.py index 6d6ff088ec0e9a..1554fa3103b269 100644 --- a/src/transformers/models/xlm/modeling_tf_xlm.py +++ b/src/transformers/models/xlm/modeling_tf_xlm.py @@ -50,8 +50,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_xlm import XLMConfig diff --git a/src/transformers/models/xlnet/modeling_tf_xlnet.py b/src/transformers/models/xlnet/modeling_tf_xlnet.py index c31b82d7862c1d..ea0f6b6baf844f 100644 --- a/src/transformers/models/xlnet/modeling_tf_xlnet.py +++ b/src/transformers/models/xlnet/modeling_tf_xlnet.py @@ -44,8 +44,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_xlnet import XLNetConfig diff --git a/src/transformers/optimization.py b/src/transformers/optimization.py index 4d368cabf072a2..60b9dca7831b76 100644 --- a/src/transformers/optimization.py +++ b/src/transformers/optimization.py @@ -305,7 +305,7 @@ def __init__( if not no_deprecation_warning: warnings.warn( "This implementation of AdamW is deprecated and will be removed in a future version. 
Use the" - "PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning", + " PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning", FutureWarning, ) require_version("torch>=1.5.0") # add_ with alpha diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py index fab5ccb00853f8..ca6cb37547bbaf 100755 --- a/src/transformers/pipelines/__init__.py +++ b/src/transformers/pipelines/__init__.py @@ -444,7 +444,7 @@ def pipeline( If no framework is specified, will default to the one currently installed. If no framework is specified and both frameworks are installed, will default to the framework of the `model`, or to PyTorch if no model is provided. - revision(`str`, *optional*, defaults to `"main"`): + revision (`str`, *optional*, defaults to `"main"`): When passing a task name or a string model identifier: The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any identifier allowed by git. @@ -452,8 +452,7 @@ def pipeline( Whether or not to use a Fast tokenizer if possible (a [`PreTrainedTokenizerFast`]). use_auth_token (`str` or *bool*, *optional*): The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `transformers-cli login` (stored in `~/.huggingface`). revision(`str`, *optional*, defaults to - `"main"`): + when running `transformers-cli login` (stored in `~/.huggingface`). model_kwargs: Additional dictionary of keyword arguments passed along to the model's `from_pretrained(..., **model_kwargs)` function. @@ -621,15 +620,20 @@ def pipeline( import kenlm # to trigger `ImportError` if not installed from pyctcdecode import BeamSearchDecoderCTC - language_model_glob = os.path.join(BeamSearchDecoderCTC._LANGUAGE_MODEL_SERIALIZED_DIRECTORY, "*") - alphabet_filename = BeamSearchDecoderCTC._ALPHABET_SERIALIZED_FILENAME - allow_regex = [language_model_glob, alphabet_filename] + if os.path.isdir(model_name) or os.path.isfile(model_name): + decoder = BeamSearchDecoderCTC.load_from_dir(model_name) + else: + language_model_glob = os.path.join( + BeamSearchDecoderCTC._LANGUAGE_MODEL_SERIALIZED_DIRECTORY, "*" + ) + alphabet_filename = BeamSearchDecoderCTC._ALPHABET_SERIALIZED_FILENAME + allow_regex = [language_model_glob, alphabet_filename] + decoder = BeamSearchDecoderCTC.load_from_hf_hub(model_name, allow_regex=allow_regex) - decoder = BeamSearchDecoderCTC.load_from_hf_hub(model_name, allow_regex=allow_regex) kwargs["decoder"] = decoder except ImportError as e: logger.warning( - "Could not load the `decoder` for {model_name}. Defaulting to raw CTC. Try to install `pyctcdecode` and `kenlm`: (`pip install pyctcdecode`, `pip install https://github.com/kpu/kenlm/archive/master.zip`): Error: {e}" + f"Could not load the `decoder` for {model_name}. Defaulting to raw CTC. 
Try to install `pyctcdecode` and `kenlm`: (`pip install pyctcdecode`, `pip install https://github.com/kpu/kenlm/archive/master.zip`): Error: {e}" ) if task == "translation" and model.config.task_specific_params: diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py index e57fb7d5e45ca1..df0c24a5a521c4 100644 --- a/src/transformers/pipelines/automatic_speech_recognition.py +++ b/src/transformers/pipelines/automatic_speech_recognition.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from collections import defaultdict -from typing import TYPE_CHECKING, Union +from typing import TYPE_CHECKING, Dict, Optional, Union import numpy as np @@ -180,7 +180,11 @@ def _sanitize_parameters(self, **kwargs): if "stride_length_s" in kwargs: preprocess_params["stride_length_s"] = kwargs["stride_length_s"] - return preprocess_params, {}, {} + postprocess_params = {} + if "decoder_kwargs" in kwargs: + postprocess_params["decoder_kwargs"] = kwargs["decoder_kwargs"] + + return preprocess_params, {}, postprocess_params def preprocess(self, inputs, chunk_length_s=0, stride_length_s=None): if isinstance(inputs, str): @@ -319,7 +323,7 @@ def _forward(self, model_inputs): extra = model_inputs return {"is_last": is_last, **out, **extra} - def postprocess(self, model_outputs): + def postprocess(self, model_outputs, decoder_kwargs: Optional[Dict] = None): if self.type == "ctc_with_lm": final_logits = [] for outputs in model_outputs: @@ -334,9 +338,11 @@ def postprocess(self, model_outputs): right_n = total_n - right logits = logits[:, left:right_n] final_logits.append(logits) + if decoder_kwargs is None: + decoder_kwargs = {} logits = np.concatenate(final_logits, axis=1) logits = logits.squeeze(0) - text = self.decoder.decode_beams(logits)[0][0] + text = self.decoder.decode_beams(logits, **decoder_kwargs)[0][0] else: skip_special_tokens = self.type != "ctc" tokens = np.concatenate([outputs["tokens"].numpy() for outputs in model_outputs], axis=-1) diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py index 5445b718e344e5..fbfe56375d86eb 100644 --- a/src/transformers/pipelines/base.py +++ b/src/transformers/pipelines/base.py @@ -49,6 +49,9 @@ from torch.utils.data import DataLoader, Dataset from ..models.auto.modeling_auto import AutoModel + + # Re-export for backward compatibility + from .pt_utils import KeyDataset else: Dataset = None KeyDataset = None diff --git a/src/transformers/pipelines/question_answering.py b/src/transformers/pipelines/question_answering.py index 61edb8b3004404..efab83b92f9821 100644 --- a/src/transformers/pipelines/question_answering.py +++ b/src/transformers/pipelines/question_answering.py @@ -182,6 +182,8 @@ def _sanitize_parameters( preprocess_params["doc_stride"] = doc_stride if max_question_len is not None: preprocess_params["max_question_len"] = max_question_len + if max_seq_len is not None: + preprocess_params["max_seq_len"] = max_seq_len postprocess_params = {} if topk is not None and top_k is None: diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index ec6196c862a9cc..dad3d5c7d6135d 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -17,10 +17,18 @@ """ import importlib.util +import os from pathlib import Path +from .dynamic_module_utils import custom_object_save +from .file_utils import PushToHubMixin, copy_func +from 
.tokenization_utils_base import PreTrainedTokenizerBase +from .utils import logging -# Comment to write + +logger = logging.get_logger(__name__) + +# Dynamically import the Transformers module to grab the attribute classes of the processor form their names. spec = importlib.util.spec_from_file_location( "transformers", Path(__file__).parent / "__init__.py", submodule_search_locations=[Path(__file__).parent] ) @@ -33,7 +41,7 @@ } -class ProcessorMixin: +class ProcessorMixin(PushToHubMixin): """ This is a mixin used to provide saving/loading functionality for all processor classes. """ @@ -42,6 +50,7 @@ class ProcessorMixin: # Names need to be attr_class for attr in attributes feature_extractor_class = None tokenizer_class = None + _auto_class = None # args have to match the attributes class attribute def __init__(self, *args, **kwargs): @@ -83,7 +92,7 @@ def __repr__(self): attributes_repr = "\n".join(attributes_repr) return f"{self.__class__.__name__}:\n{attributes_repr}" - def save_pretrained(self, save_directory): + def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs): """ Saves the attributes of this processor (feature extractor, tokenizer...) in the specified directory so that it can be reloaded using the [`~ProcessorMixin.from_pretrained`] method. @@ -100,7 +109,32 @@ def save_pretrained(self, save_directory): save_directory (`str` or `os.PathLike`): Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will be created if it does not exist). + push_to_hub (`bool`, *optional*, defaults to `False`): + Whether or not to push your processor to the Hugging Face model hub after saving it. + + + + Using `push_to_hub=True` will synchronize the repository you are pushing to with `save_directory`, + which requires `save_directory` to be a local clone of the repo you are pushing to if it's an existing + folder. Pass along `temp_dir=True` to use a temporary directory instead. + + + + kwargs: + Additional key word arguments passed along to the [`~file_utils.PushToHubMixin.push_to_hub`] method. """ + if push_to_hub: + commit_message = kwargs.pop("commit_message", None) + repo = self._create_or_get_repo(save_directory, **kwargs) + + os.makedirs(save_directory, exist_ok=True) + # If we have a custom config, we copy the file defining it in the folder and set the attributes so it can be + # loaded from the Hub. + if self._auto_class is not None: + attrs = [getattr(self, attribute_name) for attribute_name in self.attributes] + configs = [(a.init_kwargs if isinstance(a, PreTrainedTokenizerBase) else a) for a in attrs] + custom_object_save(self, save_directory, config=configs) + for attribute_name in self.attributes: attribute = getattr(self, attribute_name) # Include the processor class in the attribute config so this processor can then be reloaded with the @@ -109,6 +143,17 @@ def save_pretrained(self, save_directory): attribute._set_processor_class(self.__class__.__name__) attribute.save_pretrained(save_directory) + if self._auto_class is not None: + # We added an attribute to the init_kwargs of the tokenizers, which needs to be cleaned up. 
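# Hedged sketch of the `push_to_hub=True` path that `ProcessorMixin.save_pretrained` gains
# above. The checkpoint and the target folder/repo name are illustrative, and pushing assumes
# you are logged in via `huggingface-cli login` (or pass a token through the extra kwargs).
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
# Saves the feature extractor and tokenizer files locally, then pushes the folder to the Hub;
# `commit_message` is popped by save_pretrained and forwarded to the push_to_hub machinery.
processor.save_pretrained(
    "wav2vec2-base-960h-copy",
    push_to_hub=True,
    commit_message="Upload processor",
)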
+ for attribute_name in self.attributes: + attribute = getattr(self, attribute_name) + if isinstance(attribute, PreTrainedTokenizerBase): + del attribute.init_kwargs["auto_map"] + + if push_to_hub: + url = self._push_to_hub(repo, commit_message=commit_message) + logger.info(f"Processor pushed to the hub in this commit: {url}") + @classmethod def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): r""" @@ -142,6 +187,32 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs) return cls(*args) + @classmethod + def register_for_auto_class(cls, auto_class="AutoProcessor"): + """ + Register this class with a given auto class. This should only be used for custom feature extractors as the ones + in the library are already mapped with `AutoProcessor`. + + + + This API is experimental and may have some slight breaking changes in the next releases. + + + + Args: + auto_class (`str` or `type`, *optional*, defaults to `"AutoProcessor"`): + The auto class to register this new feature extractor with. + """ + if not isinstance(auto_class, str): + auto_class = auto_class.__name__ + + import transformers.models.auto as auto_module + + if not hasattr(auto_module, auto_class): + raise ValueError(f"{auto_class} is not a valid auto class.") + + cls._auto_class = auto_class + @classmethod def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs): args = [] @@ -159,3 +230,9 @@ def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs) args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs)) return args + + +ProcessorMixin.push_to_hub = copy_func(ProcessorMixin.push_to_hub) +ProcessorMixin.push_to_hub.__doc__ = ProcessorMixin.push_to_hub.__doc__.format( + object="processor", object_class="AutoProcessor", object_files="processor files" +) diff --git a/src/transformers/tf_utils.py b/src/transformers/tf_utils.py new file mode 100644 index 00000000000000..42c744be7a67d9 --- /dev/null +++ b/src/transformers/tf_utils.py @@ -0,0 +1,51 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Union + +import numpy as np +import tensorflow as tf + +from .utils import logging + + +logger = logging.get_logger(__name__) + + +def set_tensor_by_indices_to_value(tensor: tf.Tensor, indices: tf.Tensor, value: Union[tf.Tensor, int, float]): + # create value_tensor since tensor value assignment is not possible in TF + return tf.where(indices, value, tensor) + + +def shape_list(tensor: Union[tf.Tensor, np.ndarray]) -> List[int]: + """ + Deal with dynamic shape in tensorflow cleanly. + + Args: + tensor (`tf.Tensor` or `np.ndarray`): The tensor we want the shape of. + + Returns: + `List[int]`: The shape of the tensor as a list. 
+ """ + if isinstance(tensor, np.ndarray): + return list(tensor.shape) + + dynamic = tf.shape(tensor) + + if tensor.shape == tf.TensorShape(None): + return dynamic + + static = tensor.shape.as_list() + + return [dynamic[i] if s is None else s for i, s in enumerate(static)] diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 27ed8b8984a722..0f3f14c575b629 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1738,16 +1738,14 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], else: raise error - except RepositoryNotFoundError as err: - logger.error(err) + except RepositoryNotFoundError: raise EnvironmentError( f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier " "listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to " "pass a token having permission to this repo with `use_auth_token` or log in with " "`huggingface-cli login` and pass `use_auth_token=True`." ) - except RevisionNotFoundError as err: - logger.error(err) + except RevisionNotFoundError: raise EnvironmentError( f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists " "for this model name. Check the model page at " diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 08c83f9f7fa9ef..0d0d8403b4e36c 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -1638,7 +1638,7 @@ def _load_rng_state(self, checkpoint): try: torch.cuda.random.set_rng_state_all(checkpoint_rng_state["cuda"]) except Exception as e: - logger.infor( + logger.info( f"Didn't manage to set back the RNG states of the GPU because of the following error:\n {e}" "\nThis won't yield the same results as if the training had not been interrupted." 
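# Small sketch of the `shape_list` helper that now lives in `transformers.tf_utils` (the new
# module added above, which the TF model files were switched to import from). Static
# dimensions come back as Python ints and unknown ones as scalar tensors, so the result can
# be used safely inside tf.function-compiled code.
import tensorflow as tf
from transformers.tf_utils import shape_list

x = tf.ones((2, 7, 16))
print(shape_list(x))  # [2, 7, 16] -- all static, plain ints

@tf.function(input_signature=[tf.TensorSpec(shape=[None, None, 16], dtype=tf.float32)])
def flatten_batch(t):
    batch, seq, hidden = shape_list(t)  # batch/seq are dynamic tensors here, hidden is the int 16
    return tf.reshape(t, (batch * seq, hidden))

print(flatten_batch(x).shape)  # (14, 16)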
) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index f1982e0386c5c7..93dc4918d014ae 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -2291,17 +2291,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class MaskFormerPreTrainedModel(metaclass=DummyObject): +class MaskFormerPretrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -def load_tf_weights_in_maskformer(*args, **kwargs): - requires_backends(load_tf_weights_in_maskformer, ["torch"]) - - class MBartForCausalLM(metaclass=DummyObject): _backends = ["torch"] @@ -2797,6 +2793,30 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +POOLFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class PoolFormerForImageClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class PoolFormerModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class PoolFormerPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/src/transformers/utils/dummy_tf_objects.py b/src/transformers/utils/dummy_tf_objects.py index 02b401ef394ec1..6bba825a889785 100644 --- a/src/transformers/utils/dummy_tf_objects.py +++ b/src/transformers/utils/dummy_tf_objects.py @@ -17,6 +17,48 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["tf"]) +class TFLogitsProcessor(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFLogitsProcessorList(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFMinLengthLogitsProcessor(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFNoBadWordsLogitsProcessor(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFNoRepeatNGramLogitsProcessor(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFRepetitionPenaltyLogitsProcessor(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + def tf_top_k_top_p_filtering(*args, **kwargs): requires_backends(tf_top_k_top_p_filtering, ["tf"]) diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index b52ca284d8bfc7..e0e8ec0d3dbbec 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -52,35 +52,35 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) -class MaskFormerFeatureExtractor(metaclass=DummyObject): +class ImageGPTFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) -class ImageGPTFeatureExtractor(metaclass=DummyObject): +class LayoutLMv2FeatureExtractor(metaclass=DummyObject): _backends = ["vision"] def __init__(self, *args, **kwargs): 
requires_backends(self, ["vision"]) -class LayoutLMv2FeatureExtractor(metaclass=DummyObject): +class LayoutLMv2Processor(metaclass=DummyObject): _backends = ["vision"] def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) -class LayoutLMv2Processor(metaclass=DummyObject): +class LayoutXLMProcessor(metaclass=DummyObject): _backends = ["vision"] def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) -class LayoutXLMProcessor(metaclass=DummyObject): +class MaskFormerFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] def __init__(self, *args, **kwargs): @@ -94,6 +94,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class PoolFormerFeatureExtractor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class SegformerFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py index 37b62d5772c875..3dbe073e683dae 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py @@ -53,8 +53,8 @@ get_initializer, input_processing, keras_serializable, - shape_list, ) +from ...tf_utils import shape_list from ...utils import logging from .configuration_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.camelcase_modelname}}Config @@ -1803,7 +1803,7 @@ def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAn TFWrappedEmbeddings, input_processing, keras_serializable, - shape_list, +); from ...tf_utils import (shape_list, ) from ...utils import logging from .configuration_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.camelcase_modelname}}Config diff --git a/tests/test_activations.py b/tests/test_activations.py index 2591352f39ff75..71b29913103479 100644 --- a/tests/test_activations.py +++ b/tests/test_activations.py @@ -40,6 +40,10 @@ def test_get_activation(self): get_activation("gelu_new") get_activation("gelu_fast") get_activation("gelu_python") + get_activation("quick_gelu") + get_activation("mish") + get_activation("linear") + get_activation("sigmoid") with self.assertRaises(KeyError): get_activation("bogus") with self.assertRaises(KeyError): diff --git a/tests/test_configuration_common.py b/tests/test_configuration_common.py index 2b4a023d91c05c..a073c5250746fa 100644 --- a/tests/test_configuration_common.py +++ b/tests/test_configuration_common.py @@ -334,8 +334,12 @@ def test_repo_versioning_before(self): import transformers as new_transformers new_transformers.configuration_utils.__version__ = "v4.0.0" - new_configuration = new_transformers.models.auto.AutoConfig.from_pretrained(repo) + new_configuration, kwargs = new_transformers.models.auto.AutoConfig.from_pretrained( + repo, return_unused_kwargs=True + ) self.assertEqual(new_configuration.hidden_size, 2) + # This checks `_configuration_file` ia not kept in the kwargs by mistake. + self.assertDictEqual(kwargs, {"_from_auto": True}) # Testing an older version by monkey-patching the version in the module it's used. 
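# Hedged sketch of the extra activation names exercised by the new test_get_activation cases
# above; get_activation looks the name up in ACT2FN (raising KeyError for unknown names, as the
# test checks) and returns a callable that can be applied to a tensor directly.
import torch
from transformers.activations import get_activation

x = torch.linspace(-2.0, 2.0, steps=5)
for name in ["quick_gelu", "mish", "linear", "sigmoid"]:
    activation = get_activation(name)
    print(name, activation(x))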
import transformers as old_transformers diff --git a/tests/test_feature_extraction_auto.py b/tests/test_feature_extraction_auto.py index c827b0a656916d..c8a785d2914c8a 100644 --- a/tests/test_feature_extraction_auto.py +++ b/tests/test_feature_extraction_auto.py @@ -15,13 +15,28 @@ import json import os +import sys import tempfile import unittest - -from transformers import AutoFeatureExtractor, Wav2Vec2Config, Wav2Vec2FeatureExtractor +from pathlib import Path + +from transformers import ( + CONFIG_MAPPING, + FEATURE_EXTRACTOR_MAPPING, + AutoConfig, + AutoFeatureExtractor, + Wav2Vec2Config, + Wav2Vec2FeatureExtractor, +) from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER +sys.path.append(str(Path(__file__).parent.parent / "utils")) + +from test_module.custom_configuration import CustomConfig # noqa E402 +from test_module.custom_feature_extraction import CustomFeatureExtractor # noqa E402 + + SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures") SAMPLE_FEATURE_EXTRACTION_CONFIG = os.path.join( os.path.dirname(os.path.abspath(__file__)), "fixtures/dummy_feature_extractor_config.json" @@ -82,3 +97,30 @@ def test_feature_extractor_not_found(self): "hf-internal-testing/config-no-model does not appear to have a file named preprocessor_config.json.", ): _ = AutoFeatureExtractor.from_pretrained("hf-internal-testing/config-no-model") + + def test_from_pretrained_dynamic_feature_extractor(self): + model = AutoFeatureExtractor.from_pretrained( + "hf-internal-testing/test_dynamic_feature_extractor", trust_remote_code=True + ) + self.assertEqual(model.__class__.__name__, "NewFeatureExtractor") + + def test_new_feature_extractor_registration(self): + try: + AutoConfig.register("custom", CustomConfig) + AutoFeatureExtractor.register(CustomConfig, CustomFeatureExtractor) + # Trying to register something existing in the Transformers library will raise an error + with self.assertRaises(ValueError): + AutoFeatureExtractor.register(Wav2Vec2Config, Wav2Vec2FeatureExtractor) + + # Now that the config is registered, it can be used as any other config with the auto-API + feature_extractor = CustomFeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR) + with tempfile.TemporaryDirectory() as tmp_dir: + feature_extractor.save_pretrained(tmp_dir) + new_feature_extractor = AutoFeatureExtractor.from_pretrained(tmp_dir) + self.assertIsInstance(new_feature_extractor, CustomFeatureExtractor) + + finally: + if "custom" in CONFIG_MAPPING._extra_content: + del CONFIG_MAPPING._extra_content["custom"] + if CustomConfig in FEATURE_EXTRACTOR_MAPPING._extra_content: + del FEATURE_EXTRACTOR_MAPPING._extra_content[CustomConfig] diff --git a/tests/test_feature_extraction_common.py b/tests/test_feature_extraction_common.py index 217da135ca1cd3..098d982b14980f 100644 --- a/tests/test_feature_extraction_common.py +++ b/tests/test_feature_extraction_common.py @@ -16,9 +16,21 @@ import json import os +import sys import tempfile +import unittest +from pathlib import Path +from huggingface_hub import Repository, delete_repo, login +from requests.exceptions import HTTPError +from transformers import AutoFeatureExtractor, Wav2Vec2FeatureExtractor from transformers.file_utils import is_torch_available, is_vision_available +from transformers.testing_utils import PASS, USER, is_staging_test + + +sys.path.append(str(Path(__file__).parent.parent / "utils")) + +from test_module.custom_feature_extraction import CustomFeatureExtractor # noqa E402 if 
is_torch_available(): @@ -29,6 +41,9 @@ from PIL import Image +SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures") + + def prepare_image_inputs(feature_extract_tester, equal_resolution=False, numpify=False, torchify=False): """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, or a list of PyTorch tensors if one specifies torchify=True. @@ -99,3 +114,77 @@ def test_feat_extract_from_and_save_pretrained(self): def test_init_without_params(self): feat_extract = self.feature_extraction_class() self.assertIsNotNone(feat_extract) + + +@is_staging_test +class FeatureExtractorPushToHubTester(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls._token = login(username=USER, password=PASS) + + @classmethod + def tearDownClass(cls): + try: + delete_repo(token=cls._token, name="test-feature-extractor") + except HTTPError: + pass + + try: + delete_repo(token=cls._token, name="test-feature-extractor-org", organization="valid_org") + except HTTPError: + pass + + try: + delete_repo(token=cls._token, name="test-dynamic-feature-extractor") + except HTTPError: + pass + + def test_push_to_hub(self): + feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR) + with tempfile.TemporaryDirectory() as tmp_dir: + feature_extractor.save_pretrained( + os.path.join(tmp_dir, "test-feature-extractor"), push_to_hub=True, use_auth_token=self._token + ) + + new_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(f"{USER}/test-feature-extractor") + for k, v in feature_extractor.__dict__.items(): + self.assertEqual(v, getattr(new_feature_extractor, k)) + + def test_push_to_hub_in_organization(self): + feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR) + + with tempfile.TemporaryDirectory() as tmp_dir: + feature_extractor.save_pretrained( + os.path.join(tmp_dir, "test-feature-extractor-org"), + push_to_hub=True, + use_auth_token=self._token, + organization="valid_org", + ) + + new_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("valid_org/test-feature-extractor-org") + for k, v in feature_extractor.__dict__.items(): + self.assertEqual(v, getattr(new_feature_extractor, k)) + + def test_push_to_hub_dynamic_feature_extractor(self): + CustomFeatureExtractor.register_for_auto_class() + feature_extractor = CustomFeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR) + + with tempfile.TemporaryDirectory() as tmp_dir: + repo = Repository(tmp_dir, clone_from=f"{USER}/test-dynamic-feature-extractor", use_auth_token=self._token) + feature_extractor.save_pretrained(tmp_dir) + + # This has added the proper auto_map field to the config + self.assertDictEqual( + feature_extractor.auto_map, + {"AutoFeatureExtractor": "custom_feature_extraction.CustomFeatureExtractor"}, + ) + # The code has been copied from fixtures + self.assertTrue(os.path.isfile(os.path.join(tmp_dir, "custom_feature_extraction.py"))) + + repo.push_to_hub() + + new_feature_extractor = AutoFeatureExtractor.from_pretrained( + f"{USER}/test-dynamic-feature-extractor", trust_remote_code=True + ) + # Can't make an isinstance check because the new_feature_extractor is from the CustomFeatureExtractor class of a dynamic module + self.assertEqual(new_feature_extractor.__class__.__name__, "CustomFeatureExtractor") diff --git a/tests/test_feature_extraction_maskformer.py b/tests/test_feature_extraction_maskformer.py index 
6e8094db654a63..17dafa42a0665f 100644 --- a/tests/test_feature_extraction_maskformer.py +++ b/tests/test_feature_extraction_maskformer.py @@ -19,7 +19,7 @@ import numpy as np from transformers.file_utils import is_torch_available, is_vision_available -from transformers.testing_utils import require_torch, require_vision, slow +from transformers.testing_utils import require_torch, require_vision from .test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs diff --git a/tests/test_feature_extraction_poolformer.py b/tests/test_feature_extraction_poolformer.py new file mode 100644 index 00000000000000..cec912846c68c6 --- /dev/null +++ b/tests/test_feature_extraction_poolformer.py @@ -0,0 +1,193 @@ +# coding=utf-8 +# Copyright 2022 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +from transformers.file_utils import is_torch_available, is_vision_available +from transformers.testing_utils import require_torch, require_vision + +from .test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import PoolFormerFeatureExtractor + + +class PoolFormerFeatureExtractionTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + min_resolution=30, + max_resolution=400, + do_resize_and_center_crop=True, + size=30, + crop_pct=0.9, + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + ): + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize_and_center_crop = do_resize_and_center_crop + self.size = size + self.crop_pct = crop_pct + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + + def prepare_feat_extract_dict(self): + return { + "size": self.size, + "do_resize_and_center_crop": self.do_resize_and_center_crop, + "crop_pct": self.crop_pct, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + } + + +@require_torch +@require_vision +class PoolFormerFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCase): + + feature_extraction_class = PoolFormerFeatureExtractor if is_vision_available() else None + + def setUp(self): + self.feature_extract_tester = PoolFormerFeatureExtractionTester(self) + + @property + def feat_extract_dict(self): + return self.feature_extract_tester.prepare_feat_extract_dict() + + def test_feat_extract_properties(self): + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + self.assertTrue(hasattr(feature_extractor, "do_resize_and_center_crop")) + self.assertTrue(hasattr(feature_extractor, "size")) + self.assertTrue(hasattr(feature_extractor, "crop_pct")) + self.assertTrue(hasattr(feature_extractor, "do_normalize")) + 
self.assertTrue(hasattr(feature_extractor, "image_mean")) + self.assertTrue(hasattr(feature_extractor, "image_std")) + + def test_batch_feature(self): + pass + + def test_call_pil(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random PIL images + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False) + for image in image_inputs: + self.assertIsInstance(image, Image.Image) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + + self.assertEqual( + encoded_images.shape, + ( + 1, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size, + self.feature_extract_tester.size, + ), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size, + self.feature_extract_tester.size, + ), + ) + + def test_call_numpy(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random numpy tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, numpify=True) + for image in image_inputs: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + 1, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size, + self.feature_extract_tester.size, + ), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size, + self.feature_extract_tester.size, + ), + ) + + def test_call_pytorch(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random PyTorch tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True) + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + 1, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size, + self.feature_extract_tester.size, + ), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size, + self.feature_extract_tester.size, + ), + ) diff --git a/tests/test_generation_tf_logits_process.py b/tests/test_generation_tf_logits_process.py new file mode 100644 index 00000000000000..fb9eb086e45197 --- /dev/null +++ b/tests/test_generation_tf_logits_process.py @@ -0,0 +1,172 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a clone of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +from transformers import is_tf_available +from transformers.testing_utils import require_tf + + +if is_tf_available(): + import tensorflow as tf + + from transformers.generation_tf_logits_process import ( + TFLogitsProcessorList, + TFMinLengthLogitsProcessor, + TFNoBadWordsLogitsProcessor, + TFNoRepeatNGramLogitsProcessor, + TFRepetitionPenaltyLogitsProcessor, + ) + from transformers.tf_utils import set_tensor_by_indices_to_value + + from .test_modeling_tf_common import ids_tensor + + +@require_tf +class TFLogitsProcessorTest(unittest.TestCase): + def _get_uniform_logits(self, batch_size: int, length: int): + scores = tf.ones((batch_size, length), dtype=tf.float32) / length + return scores + + def test_min_length_dist_processor(self): + vocab_size = 20 + batch_size = 4 + eos_token_id = 0 + + min_dist_processor = TFMinLengthLogitsProcessor(min_length=10, eos_token_id=eos_token_id) + + # check that min length is applied at length 5 + input_ids = ids_tensor((batch_size, 5), vocab_size=20) + scores = self._get_uniform_logits(batch_size, vocab_size) + scores_before_min_length = min_dist_processor(input_ids, scores) + self.assertListEqual(scores_before_min_length[:, eos_token_id].numpy().tolist(), 4 * [-float("inf")]) + + # check that min length is not applied anymore at length 15 + input_ids = ids_tensor((batch_size, 15), vocab_size=20) + scores = self._get_uniform_logits(batch_size, vocab_size) + scores_before_min_length = min_dist_processor(input_ids, scores) + self.assertFalse(tf.math.reduce_any(tf.math.is_inf(scores_before_min_length)).numpy()) + + def test_repetition_penalty_dist_process(self): + input_ids = tf.constant([[0, 1], [5, 0]], dtype=tf.int32) + vocab_size = 10 + + scores = self._get_uniform_logits(batch_size=2, length=vocab_size) + + mask = tf.cast(tf.constant([[1] + 9 * [0], 10 * [0]]), tf.bool) + scores = set_tensor_by_indices_to_value(scores, mask, -1 / vocab_size) + mask = tf.cast(tf.constant([10 * [0], 5 * [0] + [1] + 4 * [0]]), tf.bool) + scores = set_tensor_by_indices_to_value(scores, mask, 4 / vocab_size) + + rep_penalty_proc = TFRepetitionPenaltyLogitsProcessor(penalty=2.0) + + scores = rep_penalty_proc(input_ids, tf.identity(scores)) + + # check that values were correctly changed + self.assertAlmostEqual(scores[0, 0].numpy(), -(1 / vocab_size) * 2) + self.assertAlmostEqual(scores[0, 1].numpy(), (1 / vocab_size) / 2) + + self.assertAlmostEqual(scores[1, 0].numpy(), (1 / vocab_size) / 2) + self.assertAlmostEqual(scores[1, 5].numpy(), (4 / vocab_size) / 2) + + def test_no_repeat_ngram_dist_processor(self): + vocab_size = 3 + batch_size = 2 + + input_ids = tf.constant([[1, 1, 2, 1], [0, 1, 0, 1]], dtype=tf.int32) + scores = self._get_uniform_logits(batch_size, vocab_size) + + no_repeat_proc_2_gram = TFNoRepeatNGramLogitsProcessor(2) + no_repeat_proc_3_gram = TFNoRepeatNGramLogitsProcessor(3) + + filtered_scores_2_gram = no_repeat_proc_2_gram(input_ids, tf.identity(scores)) + filtered_scores_3_gram = no_repeat_proc_3_gram(input_ids, tf.identity(scores)) + + # 2-gram would forbid 2nd and 3rd token (1,2) at 1st batch and 1st token (0) at 2nd 
batch + self.assertListEqual( + tf.math.is_inf(filtered_scores_2_gram).numpy().tolist(), [[False, True, True], [True, False, False]] + ) + + # 3-gram would forbid no token at 1st batch and 1st token (0) at 2nd batch + self.assertListEqual( + tf.math.is_inf(filtered_scores_3_gram).numpy().tolist(), [[False, False, False], [True, False, False]] + ) + + def test_no_bad_words_dist_processor(self): + vocab_size = 5 + batch_size = 2 + eos_token_id = 4 + + input_ids = tf.constant([[0, 1, 3, 1], [0, 1, 0, 1]], dtype=tf.int32) + bad_word_tokens = [[1], [4], [1, 0], [0, 1, 2], [1, 3, 1, 3]] + scores = self._get_uniform_logits(batch_size, vocab_size) + + no_bad_words_dist_proc = TFNoBadWordsLogitsProcessor(bad_words_ids=bad_word_tokens, eos_token_id=eos_token_id) + + filtered_scores = no_bad_words_dist_proc(input_ids, tf.identity(scores)) + + # batch 1: 1st, 2nd, and 4th (0, 1, 3) token are forbidden + # batch 2: 1st, 2nd, and 3rd (0, 1, 2) token are forbidden + self.assertListEqual( + tf.math.is_inf(filtered_scores).numpy().tolist(), + [[True, True, False, True, True], [True, True, True, False, True]], + ) + + def test_processor_list(self): + batch_size = 4 + sequence_length = 10 + vocab_size = 15 + eos_token_id = 0 + + # dummy input_ids and scores + input_ids = ids_tensor((batch_size, sequence_length), vocab_size) + input_ids_comp = tf.identity(input_ids) + + scores = self._get_uniform_logits(batch_size, vocab_size) + scores_comp = tf.identity(scores) + + # instantiate all dist processors + min_dist_proc = TFMinLengthLogitsProcessor(min_length=10, eos_token_id=eos_token_id) + rep_penalty_proc = TFRepetitionPenaltyLogitsProcessor(penalty=2.0) + no_repeat_proc = TFNoRepeatNGramLogitsProcessor(2) + no_bad_words_dist_proc = TFNoBadWordsLogitsProcessor(bad_words_ids=[[1]], eos_token_id=eos_token_id) + + # no processor list + scores = min_dist_proc(input_ids, scores) + scores = rep_penalty_proc(input_ids, scores) + scores = no_repeat_proc(input_ids, scores) + scores = no_bad_words_dist_proc(input_ids, scores) + + # with processor list + processor = TFLogitsProcessorList( + [ + min_dist_proc, + rep_penalty_proc, + no_repeat_proc, + no_bad_words_dist_proc, + ] + ) + scores_comp = processor(input_ids, scores_comp) + + # remove inf + scores = set_tensor_by_indices_to_value(scores, tf.math.is_inf(scores), -1e9) + scores_comp = set_tensor_by_indices_to_value(scores_comp, tf.math.is_inf(scores_comp), -1e9) + + # scores should be equal + tf.debugging.assert_near(scores, scores_comp, atol=1e-3) + + # input_ids should never be changed + self.assertListEqual(input_ids.numpy().tolist(), input_ids_comp.numpy().tolist()) diff --git a/tests/test_modeling_clip.py b/tests/test_modeling_clip.py index 911ba291e641c7..353461bd1b93c9 100644 --- a/tests/test_modeling_clip.py +++ b/tests/test_modeling_clip.py @@ -625,15 +625,15 @@ def test_pt_tf_model_equivalence(self): if type(tensor) == bool: tf_inputs_dict[key] = tensor elif key == "input_values": - tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32) + tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32) elif key == "pixel_values": - tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32) + tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32) else: - tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.int32) + tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.int32) # Check we can load pt model in tf and vice-versa with model => 
model functions tf_model = transformers.load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=tf_inputs_dict) - pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model) + pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model).to(torch_device) # need to rename encoder-decoder "inputs" for PyTorch # if "inputs" in pt_inputs_dict and self.is_encoder_decoder: @@ -650,7 +650,7 @@ def test_pt_tf_model_equivalence(self): continue tf_out = tf_output.numpy() - pt_out = pt_output.numpy() + pt_out = pt_output.cpu().numpy() self.assertEqual(tf_out.shape, pt_out.shape, "Output component shapes differ between TF and PyTorch") @@ -676,6 +676,7 @@ def test_pt_tf_model_equivalence(self): tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5") tf_model.save_weights(tf_checkpoint_path) pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path) + pt_model = pt_model.to(torch_device) # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences pt_model.eval() @@ -686,11 +687,11 @@ def test_pt_tf_model_equivalence(self): tensor = np.array(tensor, dtype=bool) tf_inputs_dict[key] = tf.convert_to_tensor(tensor, dtype=tf.int32) elif key == "input_values": - tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32) + tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32) elif key == "pixel_values": - tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32) + tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32) else: - tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.int32) + tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.int32) # need to rename encoder-decoder "inputs" for PyTorch # if "inputs" in pt_inputs_dict and self.is_encoder_decoder: @@ -708,7 +709,7 @@ def test_pt_tf_model_equivalence(self): continue tf_out = tf_output.numpy() - pt_out = pt_output.numpy() + pt_out = pt_output.cpu().numpy() self.assertEqual(tf_out.shape, pt_out.shape, "Output component shapes differ between TF and PyTorch") diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 93fcfd0bb7a830..d9ef0687774e8f 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -1475,17 +1475,17 @@ def test_pt_tf_model_equivalence(self): if type(tensor) == bool: tf_inputs_dict[key] = tensor elif key == "input_values": - tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32) + tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32) elif key == "pixel_values": - tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32) + tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32) elif key == "input_features": - tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32) + tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32) else: - tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.int32) + tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.int32) # Check we can load pt model in tf and vice-versa with model => model functions tf_model = transformers.load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=tf_inputs_dict) - pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model) + pt_model = 
transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model).to(torch_device) # need to rename encoder-decoder "inputs" for PyTorch # if "inputs" in pt_inputs_dict and self.is_encoder_decoder: @@ -1496,7 +1496,7 @@ def test_pt_tf_model_equivalence(self): tfo = tf_model(tf_inputs_dict, training=False) tf_hidden_states = tfo[0].numpy() - pt_hidden_states = pto[0].numpy() + pt_hidden_states = pto[0].cpu().numpy() tf_nans = np.copy(np.isnan(tf_hidden_states)) pt_nans = np.copy(np.isnan(pt_hidden_states)) @@ -1518,6 +1518,7 @@ def test_pt_tf_model_equivalence(self): tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5") tf_model.save_weights(tf_checkpoint_path) pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path) + pt_model = pt_model.to(torch_device) # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences pt_model.eval() @@ -1528,13 +1529,13 @@ def test_pt_tf_model_equivalence(self): tensor = np.array(tensor, dtype=bool) tf_inputs_dict[key] = tf.convert_to_tensor(tensor, dtype=tf.int32) elif key == "input_values": - tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32) + tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32) elif key == "pixel_values": - tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32) + tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32) elif key == "input_features": - tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.float32) + tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32) else: - tf_inputs_dict[key] = tf.convert_to_tensor(tensor.numpy(), dtype=tf.int32) + tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.int32) # need to rename encoder-decoder "inputs" for PyTorch # if "inputs" in pt_inputs_dict and self.is_encoder_decoder: @@ -1545,7 +1546,7 @@ def test_pt_tf_model_equivalence(self): tfo = tf_model(tf_inputs_dict) tfo = tfo[0].numpy() - pto = pto[0].numpy() + pto = pto[0].cpu().numpy() tf_nans = np.copy(np.isnan(tfo)) pt_nans = np.copy(np.isnan(pto)) diff --git a/tests/test_modeling_lxmert.py b/tests/test_modeling_lxmert.py index bbc123275ea636..652e6473c7f11d 100644 --- a/tests/test_modeling_lxmert.py +++ b/tests/test_modeling_lxmert.py @@ -776,16 +776,16 @@ def recursive_numpy_convert(iterable): else: if isinstance(value, (list, tuple)): return_dict[key] = ( - tf.convert_to_tensor(iter_value.numpy(), dtype=tf.int32) for iter_value in value + tf.convert_to_tensor(iter_value.cpu().numpy(), dtype=tf.int32) for iter_value in value ) else: - return_dict[key] = tf.convert_to_tensor(value.numpy(), dtype=tf.int32) + return_dict[key] = tf.convert_to_tensor(value.cpu().numpy(), dtype=tf.int32) return return_dict tf_inputs_dict = recursive_numpy_convert(pt_inputs) tf_model = transformers.load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=tf_inputs_dict) - pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model) + pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model).to(torch_device) # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences pt_model.eval() @@ -795,12 +795,6 @@ def recursive_numpy_convert(iterable): if "obj_labels" in inputs_dict: del inputs_dict["obj_labels"] - def torch_type(key): - if key in ("visual_feats", "visual_pos"): - return torch.float32 - else: - 
return torch.long - pt_inputs = self._prepare_for_class(inputs_dict, model_class) tf_inputs_dict = recursive_numpy_convert(pt_inputs) @@ -808,7 +802,7 @@ def torch_type(key): pto = pt_model(**pt_inputs) tfo = tf_model(tf_inputs_dict, training=False) tf_hidden_states = tfo[0].numpy() - pt_hidden_states = pto[0].numpy() + pt_hidden_states = pto[0].cpu().numpy() tf_nans = np.copy(np.isnan(tf_hidden_states)) pt_nans = np.copy(np.isnan(pt_hidden_states)) @@ -852,7 +846,7 @@ def torch_type(key): tfo = tf_model(tf_inputs_dict) tfo = tfo[0].numpy() - pto = pto[0].numpy() + pto = pto[0].cpu().numpy() tf_nans = np.copy(np.isnan(tfo)) pt_nans = np.copy(np.isnan(pto)) diff --git a/tests/test_modeling_mask_former.py b/tests/test_modeling_maskformer.py similarity index 97% rename from tests/test_modeling_mask_former.py rename to tests/test_modeling_maskformer.py index 2539e2a7890676..1293bae8561ffd 100644 --- a/tests/test_modeling_mask_former.py +++ b/tests/test_modeling_maskformer.py @@ -17,23 +17,18 @@ import unittest -import pytest - -from parameterized import parameterized from tests.test_modeling_common import floats_tensor from transformers import MaskFormerConfig, is_torch_available, is_vision_available from transformers.file_utils import cached_property from transformers.models.maskformer.configuration_maskformer import MASKFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP -from transformers.models.maskformer.MaskFormer.mask_former.mask_former_model import MaskFormer from transformers.models.maskformer.modeling_maskformer import ( MaskFormerForInstanceSegmentationOutput, MaskFormerOutput, ) from transformers.testing_utils import require_torch, require_vision, slow, torch_device -from transformers.trainer_callback import TrainerState from .test_configuration_common import ConfigTester -from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask +from .test_modeling_common import ModelTesterMixin if is_torch_available(): diff --git a/tests/test_modeling_poolformer.py b/tests/test_modeling_poolformer.py new file mode 100644 index 00000000000000..afbb5e1a6f7575 --- /dev/null +++ b/tests/test_modeling_poolformer.py @@ -0,0 +1,331 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch PoolFormer model. 
""" + + +import inspect +import unittest +from typing import Dict, List, Tuple + +from transformers import is_torch_available, is_vision_available +from transformers.models.auto import get_values +from transformers.testing_utils import require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor + + +if is_torch_available(): + import torch + + from transformers import MODEL_MAPPING, PoolFormerConfig, PoolFormerForImageClassification, PoolFormerModel + from transformers.models.poolformer.modeling_poolformer import POOLFORMER_PRETRAINED_MODEL_ARCHIVE_LIST + + +if is_vision_available(): + from PIL import Image + + from transformers import PoolFormerFeatureExtractor + + +class PoolFormerConfigTester(ConfigTester): + def create_and_test_config_common_properties(self): + config = self.config_class(**self.inputs_dict) + self.parent.assertTrue(hasattr(config, "hidden_sizes")) + self.parent.assertTrue(hasattr(config, "num_encoder_blocks")) + + +class PoolFormerModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=64, + num_channels=3, + num_encoder_blocks=4, + depths=[2, 2, 2, 2], + sr_ratios=[8, 4, 2, 1], + hidden_sizes=[16, 32, 64, 128], + downsampling_rates=[1, 4, 8, 16], + is_training=False, + use_labels=True, + hidden_act="gelu", + hidden_dropout_prob=0.1, + initializer_range=0.02, + num_labels=3, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.num_channels = num_channels + self.num_encoder_blocks = num_encoder_blocks + self.sr_ratios = sr_ratios + self.depths = depths + self.hidden_sizes = hidden_sizes + self.downsampling_rates = downsampling_rates + self.is_training = is_training + self.use_labels = use_labels + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.initializer_range = initializer_range + self.num_labels = num_labels + self.scope = scope + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels) + + config = PoolFormerConfig( + image_size=self.image_size, + num_channels=self.num_channels, + num_encoder_blocks=self.num_encoder_blocks, + depths=self.depths, + hidden_sizes=self.hidden_sizes, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + initializer_range=self.initializer_range, + ) + + return config, pixel_values, labels + + def create_and_check_model(self, config, pixel_values, labels): + model = PoolFormerModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + expected_height = expected_width = self.image_size // 32.0 + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.hidden_sizes[-1], expected_height, expected_width) + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class PoolFormerModelTest(ModelTesterMixin, unittest.TestCase): + + all_model_classes = (PoolFormerModel, PoolFormerForImageClassification) if is_torch_available() else () + + test_head_masking = False + test_pruning = False + test_resize_embeddings = False + 
test_torchscript = False + + def setUp(self): + self.model_tester = PoolFormerModelTester(self) + self.config_tester = PoolFormerConfigTester(self, config_class=PoolFormerConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @unittest.skip("PoolFormer does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip("PoolFormer does not have get_input_embeddings method and get_output_embeddings methods") + def test_model_common_attributes(self): + pass + + def test_retain_grad_hidden_states_attentions(self): + # Since poolformer doesn't use Attention + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + + # no need to test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + model.to(torch_device) + + inputs = self._prepare_for_class(inputs_dict, model_class) + + outputs = model(**inputs) + + output = outputs[0] + + hidden_states = outputs.hidden_states[0] + + hidden_states.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(hidden_states.grad) + + def test_model_outputs_equivalence(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def set_nan_tensor_to_zero(t): + t[t != t] = 0 + return t + + def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): + with torch.no_grad(): + tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs) + dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs).to_tuple() + + def recursive_check(tuple_object, dict_object): + if isinstance(tuple_object, (List, Tuple)): + for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif isinstance(tuple_object, Dict): + for tuple_iterable_value, dict_iterable_value in zip( + tuple_object.values(), dict_object.values() + ): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif tuple_object is None: + return + else: + self.assertTrue( + torch.allclose( + set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5 + ), + msg=f"Tuple and dict output are not equal. Difference: {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`: {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. 
Dict has `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}.", + ) + + recursive_check(tuple_output, dict_output) + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + @unittest.skip("PoolFormer does not have attention") + def test_attention_outputs(self): + pass + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.hidden_states + + expected_num_layers = self.model_tester.num_encoder_blocks + self.assertEqual(len(hidden_states), expected_num_layers) + + # verify the first hidden states (first block) + self.assertListEqual( + list(hidden_states[0].shape[-3:]), + [ + self.model_tester.hidden_sizes[0], + self.model_tester.image_size // 4, + self.model_tester.image_size // 4, + ], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + def test_training(self): + if not self.model_tester.is_training: + return + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + for model_class in self.all_model_classes: + if model_class in get_values(MODEL_MAPPING): + continue + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + @slow + def test_model_from_pretrained(self): + for model_name in POOLFORMER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = PoolFormerModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify 
our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +class PoolFormerModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_image_classification_head(self): + feature_extractor = PoolFormerFeatureExtractor() + model = PoolFormerForImageClassification.from_pretrained("sail/poolformer_s12").to(torch_device) + + inputs = feature_extractor(images=prepare_img(), return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + expected_shape = torch.Size((1, 1000)) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = torch.tensor([-0.6113, 0.1685, -0.0492]).to(torch_device) + self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) diff --git a/tests/test_modeling_speech_encoder_decoder.py b/tests/test_modeling_speech_encoder_decoder.py index 6b0e5cf12fb840..6a5f1b589ca1e0 100644 --- a/tests/test_modeling_speech_encoder_decoder.py +++ b/tests/test_modeling_speech_encoder_decoder.py @@ -308,6 +308,11 @@ def check_encoder_decoder_model_generate( enc_dec_model = SpeechEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) enc_dec_model.to(torch_device) + # make sure EOS token is set to None to prevent early stopping of generation + enc_dec_model.config.eos_token_id = None + if hasattr(enc_dec_model.config, "decoder") and hasattr(enc_dec_model.config.decoder, "eos_token_id"): + enc_dec_model.config.decoder.eos_token_id = None + inputs = input_values if input_features is None else input_features # Bert does not have a bos token id, so use pad_token_id instead diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py index 95c953a6e3aec6..f293d8126fe593 100644 --- a/tests/test_modeling_tf_common.py +++ b/tests/test_modeling_tf_common.py @@ -955,7 +955,7 @@ def test_lm_head_model_random_no_beam_search_generate(self): # Models with non-text inputs won't work here; num_return_sequences = 1 self._check_generated_ids(model.generate(do_sample=True, max_length=5)) - with self.assertRaises(AssertionError): + with self.assertRaises(ValueError): # generating multiple sequences when no beam search generation # is not allowed as it would always generate the same sequences model.generate(input_ids, do_sample=False, num_return_sequences=2) diff --git a/tests/test_modeling_tf_gpt2.py b/tests/test_modeling_tf_gpt2.py index d653329a5e82ec..4f66ec89f4e2ab 100644 --- a/tests/test_modeling_tf_gpt2.py +++ b/tests/test_modeling_tf_gpt2.py @@ -26,14 +26,15 @@ if is_tf_available(): import tensorflow as tf + from transformers import GPT2Tokenizer from transformers.models.gpt2.modeling_tf_gpt2 import ( TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST, TFGPT2DoubleHeadsModel, TFGPT2ForSequenceClassification, TFGPT2LMHeadModel, TFGPT2Model, - shape_list, ) + from transformers.tf_utils import shape_list class TFGPT2ModelTester: @@ -428,60 +429,53 @@ def test_model_from_pretrained(self): @require_tf class TFGPT2ModelLanguageGenerationTest(unittest.TestCase): @slow - def test_lm_generate_gpt2(self): - model = TFGPT2LMHeadModel.from_pretrained("gpt2") - input_ids = tf.convert_to_tensor([[464, 3290]], dtype=tf.int32) # The dog - expected_output_ids = [ - 464, - 3290, - 373, - 1043, - 287, - 257, - 2214, - 1474, - 262, - 16246, - 286, - 2688, - 290, - 2688, - 27262, - 13, - 198, - 198, - 464, - 3290, - ] # The dog was found in a field near the intersection of West 
and West Streets.\n\nThe dog + def test_lm_generate_distilgpt2(self): + model = TFGPT2LMHeadModel.from_pretrained("distilgpt2") + input_ids = tf.convert_to_tensor([[464, 1893]], dtype=tf.int32) # The president + + # The president of the United States, and the president of the United Kingdom, have been in the White + # fmt: off + expected_output_ids = [464, 1893, 286, 262, 1578, 1829, 11, 290, 262, 1893, 286, 262, 1578, 7526, 11, 423, 587, 287, 262, 2635] + # fmt: on + output_ids = model.generate(input_ids, do_sample=False) self.assertListEqual(output_ids[0].numpy().tolist(), expected_output_ids) @slow - def test_lm_generate_distilgpt2(self): + def test_lm_generate_distilgpt2_batch_special(self): model = TFGPT2LMHeadModel.from_pretrained("distilgpt2") - input_ids = tf.convert_to_tensor([[464, 1893]], dtype=tf.int32) # The president - expected_output_ids = [ - 464, - 1893, - 286, - 262, - 1578, - 1829, - 11, - 290, - 262, - 1893, - 286, - 262, - 1578, - 7526, - 11, - 423, - 587, - 287, - 262, - 2635, - ] # The president of the United States, and the president of the United Kingdom, have been in the White + tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2") + + tokenizer.pad_token = tokenizer.eos_token + tokenizer.padding_side = "left" + + sentences = ["Today is a beautiful day and", "Yesterday was"] + input_ids = tokenizer(sentences, return_tensors="tf", padding=True).input_ids + + generation_kwargs = { + "bad_words_ids": [tokenizer("is").input_ids, tokenizer("angry about").input_ids], + "no_repeat_ngram_size": 2, + "do_sample": False, + "repetition_penalty": 1.3, + } + + output_ids = model.generate(input_ids, **generation_kwargs) + + output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + expected_output_string = [ + "Today is a beautiful day and I am so happy to be able take part in this amazing event.", + "Yesterday was a very busy day for the first time since I started writing this post", + ] + self.assertListEqual(output_strings, expected_output_string) + + @slow + def test_lm_generate_gpt2(self): + model = TFGPT2LMHeadModel.from_pretrained("gpt2") + input_ids = tf.convert_to_tensor([[464, 3290]], dtype=tf.int32) # The dog + # The dog was found in a field near the intersection of West and West Streets.\n\nThe dog + # fmt: off + expected_output_ids = [464, 3290, 373, 1043, 287, 257, 2214, 1474, 262, 16246, 286, 2688, 290, 2688, 27262, 13, 198, 198, 464, 3290] + # fmt: on output_ids = model.generate(input_ids, do_sample=False) self.assertListEqual(output_ids[0].numpy().tolist(), expected_output_ids) diff --git a/tests/test_modeling_tf_longformer.py b/tests/test_modeling_tf_longformer.py index b88437a1373fa5..be96de22afa02b 100644 --- a/tests/test_modeling_tf_longformer.py +++ b/tests/test_modeling_tf_longformer.py @@ -36,14 +36,7 @@ TFLongformerModel, TFLongformerSelfAttention, ) - - def shape_list(x): - """ - copied from transformers.modeling_tf_utils - """ - static = x.shape.as_list() - dynamic = tf.shape(x) - return [dynamic[i] if s is None else s for i, s in enumerate(static)] + from transformers.tf_utils import shape_list class TFLongformerModelTester: diff --git a/tests/test_modeling_tf_mt5.py b/tests/test_modeling_tf_mt5.py index 9b23e05f7523f5..1ab1a635b396c1 100644 --- a/tests/test_modeling_tf_mt5.py +++ b/tests/test_modeling_tf_mt5.py @@ -22,7 +22,24 @@ if is_tf_available(): import tensorflow as tf - from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM + from transformers import AutoTokenizer, T5Tokenizer, TFAutoModelForSeq2SeqLM, 
TFMT5ForConditionalGeneration + + +@require_tf +class TFMT5ModelTest(unittest.TestCase): # no mixin with common tests -> most cases are already covered in the TF T5 + @slow + def test_resize_embeddings(self): + model = TFMT5ForConditionalGeneration.from_pretrained("google/mt5-small") + original_vocab_size = model.get_input_embeddings().weight.shape[0] + # the vocab size is defined in the model config + self.assertEqual(original_vocab_size, model.config.vocab_size) + + tokenizer = T5Tokenizer.from_pretrained("google/mt5-small") + tokenizer.add_special_tokens({"bos_token": "", "eos_token": ""}) + model._resize_token_embeddings(len(tokenizer)) + # the vocab size is now resized to the length of the tokenizer, which is different from the original size + self.assertEqual(model.get_input_embeddings().weight.shape[0], len(tokenizer)) + self.assertNotEqual(model.get_input_embeddings().weight.shape[0], original_vocab_size) @require_tf diff --git a/tests/test_modeling_tf_speech_to_text.py b/tests/test_modeling_tf_speech_to_text.py index e34892bf126a74..6253ccf9539d0b 100644 --- a/tests/test_modeling_tf_speech_to_text.py +++ b/tests/test_modeling_tf_speech_to_text.py @@ -474,7 +474,7 @@ def test_lm_head_model_random_no_beam_search_generate(self): # num_return_sequences = 1 self._check_generated_ids(model.generate(input_features, do_sample=True)) - with self.assertRaises(AssertionError): + with self.assertRaises(ValueError): # generating multiple sequences when no beam search generation # is not allowed as it would always generate the same sequences model.generate(input_features, do_sample=False, num_return_sequences=2) diff --git a/tests/test_modeling_tf_t5.py b/tests/test_modeling_tf_t5.py index 59ee70c53ec2e7..9a5a1de199bd30 100644 --- a/tests/test_modeling_tf_t5.py +++ b/tests/test_modeling_tf_t5.py @@ -314,6 +314,20 @@ def test_generate_with_headmasking(self): # TODO: Fix head-masking according to PyTorch T5 model pass + @slow + def test_resize_embeddings(self): + model = TFT5ForConditionalGeneration.from_pretrained("t5-small") + original_vocab_size = model.get_input_embeddings().weight.shape[0] + # the vocab size is defined in the model config + self.assertEqual(original_vocab_size, model.config.vocab_size) + + tokenizer = T5Tokenizer.from_pretrained("t5-small") + tokenizer.add_special_tokens({"bos_token": "", "eos_token": ""}) + model._resize_token_embeddings(len(tokenizer)) + # the vocab size is now resized to the length of the tokenizer, which is different from the original size + self.assertEqual(model.get_input_embeddings().weight.shape[0], len(tokenizer)) + self.assertNotEqual(model.get_input_embeddings().weight.shape[0], original_vocab_size) + class TFT5EncoderOnlyModelTester: def __init__( @@ -439,6 +453,34 @@ def test_train_pipeline_custom_model(self): pass +@require_tf +@require_sentencepiece +@require_tokenizers +class TFT5GenerationIntegrationTests(unittest.TestCase): + @slow + def test_greedy_generate(self): + model = TFT5ForConditionalGeneration.from_pretrained("t5-small") + tokenizer = T5Tokenizer.from_pretrained("t5-small") + + sentences = ["Yesterday, my name was", "Today is a beautiful day and"] + input_ids = tokenizer(sentences, return_tensors="tf", padding=True).input_ids + + generation_kwargs = { + "bad_words_ids": [tokenizer("my").input_ids, tokenizer("ein schöner").input_ids], + "no_repeat_ngram_size": 3, + "do_sample": False, + "repetition_penalty": 2.2, + } + + output_ids = model.generate(input_ids, **generation_kwargs) + + output_strings = 
tokenizer.batch_decode(output_ids, skip_special_tokens=True) + + expected_output_string = ["Yesterday, my name was", "Heute ist ein schöne Tag und"] + + self.assertListEqual(expected_output_string, output_strings) + + @require_tf @require_sentencepiece @require_tokenizers diff --git a/tests/test_pipelines_automatic_speech_recognition.py b/tests/test_pipelines_automatic_speech_recognition.py index 15b5f72612cdf6..5e1adbc27d314d 100644 --- a/tests/test_pipelines_automatic_speech_recognition.py +++ b/tests/test_pipelines_automatic_speech_recognition.py @@ -18,6 +18,7 @@ import pytest from datasets import load_dataset +from huggingface_hub import snapshot_download from transformers import ( MODEL_FOR_CTC_MAPPING, MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING, @@ -363,6 +364,34 @@ def test_with_lm_fast(self): n_repeats = 2 audio_tiled = np.tile(audio, n_repeats) + output = speech_recognizer([audio_tiled], batch_size=2) + self.assertEqual(output, [{"text": ANY(str)}]) + self.assertEqual(output[0]["text"][:6], " ", decoded_processor) + self.assertEqual(decoded_decoder[0], decoded_processor.text) + self.assertEqual(" ", decoded_processor.text) + self.assertEqual(decoded_decoder[-2], decoded_processor.logit_score) + self.assertEqual(decoded_decoder[-1], decoded_processor.lm_score) def test_decoder_batch(self): feature_extractor = self.get_feature_extractor() @@ -193,15 +197,22 @@ def test_decoder_batch(self): logits = self._get_dummy_logits() - decoded_processor = processor.batch_decode(logits).text + decoded_processor = processor.batch_decode(logits) logits_list = [array for array in logits] pool = get_context("fork").Pool() - decoded_decoder = [d[0][0] for d in decoder.decode_beams_batch(pool, logits_list)] + decoded_beams = decoder.decode_beams_batch(pool, logits_list) + texts_decoder, logit_scores_decoder, lm_scores_decoder = [], [], [] + for beams in decoded_beams: + texts_decoder.append(beams[0][0]) + logit_scores_decoder.append(beams[0][-2]) + lm_scores_decoder.append(beams[0][-1]) pool.close() - self.assertListEqual(decoded_decoder, decoded_processor) - self.assertListEqual([" ", " "], decoded_processor) + self.assertListEqual(texts_decoder, decoded_processor.text) + self.assertListEqual([" ", " "], decoded_processor.text) + self.assertListEqual(logit_scores_decoder, decoded_processor.logit_score) + self.assertListEqual(lm_scores_decoder, decoded_processor.lm_score) def test_decoder_with_params(self): feature_extractor = self.get_feature_extractor() @@ -303,3 +314,39 @@ def test_decoder_download_ignores_files(self): # https://huggingface.co/hf-internal-testing/processor_with_lm/tree/main # are downloaded and none of the rest (e.g. README.md, ...) 
self.assertListEqual(downloaded_decoder_files, expected_decoder_files) + + def test_decoder_local_files(self): + local_dir = snapshot_download("hf-internal-testing/processor_with_lm") + + processor = Wav2Vec2ProcessorWithLM.from_pretrained(local_dir) + + language_model = processor.decoder.model_container[processor.decoder._model_key] + path_to_cached_dir = Path(language_model._kenlm_model.path.decode("utf-8")).parent.parent.absolute() + + local_decoder_files = os.listdir(local_dir) + expected_decoder_files = os.listdir(path_to_cached_dir) + + local_decoder_files.sort() + expected_decoder_files.sort() + + # test that both decoder form hub and local files in cache are the same + self.assertListEqual(local_decoder_files, expected_decoder_files) + + def test_processor_from_auto_processor(self): + processor_wav2vec2 = Wav2Vec2ProcessorWithLM.from_pretrained("hf-internal-testing/processor_with_lm") + processor_auto = AutoProcessor.from_pretrained("hf-internal-testing/processor_with_lm") + + raw_speech = floats_list((3, 1000)) + + input_wav2vec2 = processor_wav2vec2(raw_speech, return_tensors="np") + input_auto = processor_auto(raw_speech, return_tensors="np") + + for key in input_wav2vec2.keys(): + self.assertAlmostEqual(input_wav2vec2[key].sum(), input_auto[key].sum(), delta=1e-2) + + logits = self._get_dummy_logits() + + decoded_wav2vec2 = processor_wav2vec2.batch_decode(logits) + decoded_auto = processor_auto.batch_decode(logits) + + self.assertListEqual(decoded_wav2vec2.text, decoded_auto.text) diff --git a/tests/test_tokenization_auto.py b/tests/test_tokenization_auto.py index a6608c90ca549c..ae4e5896508d34 100644 --- a/tests/test_tokenization_auto.py +++ b/tests/test_tokenization_auto.py @@ -310,6 +310,38 @@ def test_new_tokenizer_fast_registration(self): if CustomConfig in TOKENIZER_MAPPING._extra_content: del TOKENIZER_MAPPING._extra_content[CustomConfig] + def test_from_pretrained_dynamic_tokenizer(self): + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=True) + self.assertTrue(tokenizer.special_attribute_present) + if is_tokenizers_available(): + self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast") + + # Test we can also load the slow version + tokenizer = AutoTokenizer.from_pretrained( + "hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=True, use_fast=False + ) + self.assertTrue(tokenizer.special_attribute_present) + self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer") + else: + self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer") + + def test_from_pretrained_dynamic_tokenizer_legacy_format(self): + tokenizer = AutoTokenizer.from_pretrained( + "hf-internal-testing/test_dynamic_tokenizer_legacy", trust_remote_code=True + ) + self.assertTrue(tokenizer.special_attribute_present) + if is_tokenizers_available(): + self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast") + + # Test we can also load the slow version + tokenizer = AutoTokenizer.from_pretrained( + "hf-internal-testing/test_dynamic_tokenizer_legacy", trust_remote_code=True, use_fast=False + ) + self.assertTrue(tokenizer.special_attribute_present) + self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer") + else: + self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer") + def test_repo_not_found(self): with self.assertRaisesRegex( EnvironmentError, "bert-base is not a local folder and is not a valid model identifier" diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py 
index 44c55b423c6590..e58ab9a816a509 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -3812,7 +3812,9 @@ def test_push_to_hub_dynamic_tokenizer(self): with open(os.path.join(tmp_dir, "tokenizer_config.json")) as f: tokenizer_config = json.load(f) - self.assertEqual(tokenizer_config["auto_map"], ["custom_tokenization.CustomTokenizer", None]) + self.assertDictEqual( + tokenizer_config["auto_map"], {"AutoTokenizer": ["custom_tokenization.CustomTokenizer", None]} + ) repo.push_to_hub() @@ -3837,9 +3839,14 @@ def test_push_to_hub_dynamic_tokenizer(self): with open(os.path.join(tmp_dir, "tokenizer_config.json")) as f: tokenizer_config = json.load(f) - self.assertEqual( + self.assertDictEqual( tokenizer_config["auto_map"], - ["custom_tokenization.CustomTokenizer", "custom_tokenization_fast.CustomTokenizerFast"], + { + "AutoTokenizer": [ + "custom_tokenization.CustomTokenizer", + "custom_tokenization_fast.CustomTokenizerFast", + ] + }, ) repo.push_to_hub() diff --git a/utils/test_module/custom_feature_extraction.py b/utils/test_module/custom_feature_extraction.py new file mode 100644 index 00000000000000..de367032d8fe8e --- /dev/null +++ b/utils/test_module/custom_feature_extraction.py @@ -0,0 +1,5 @@ +from transformers import Wav2Vec2FeatureExtractor + + +class CustomFeatureExtractor(Wav2Vec2FeatureExtractor): + pass diff --git a/utils/test_module/custom_processing.py b/utils/test_module/custom_processing.py new file mode 100644 index 00000000000000..196fc511b65b3b --- /dev/null +++ b/utils/test_module/custom_processing.py @@ -0,0 +1,6 @@ +from transformers import ProcessorMixin + + +class CustomProcessor(ProcessorMixin): + feature_extractor_class = "AutoFeatureExtractor" + tokenizer_class = "AutoTokenizer"
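The `assertRaises` changes in tests/test_modeling_tf_common.py and tests/test_modeling_tf_speech_to_text.py above switch the expected exception to ValueError when multiple return sequences are requested without sampling or beam search. A minimal sketch of that behavior, assuming distilgpt2 (already used elsewhere in this diff) as the checkpoint and reusing the input ids quoted there for "The president":

# Sketch only: greedy decoding with num_return_sequences > 1 is now rejected with a
# ValueError (previously an AssertionError), since it would repeat the same sequence.
# "distilgpt2" and the input ids for "The president" are taken from the tests above.
import tensorflow as tf

from transformers import TFGPT2LMHeadModel

model = TFGPT2LMHeadModel.from_pretrained("distilgpt2")
input_ids = tf.convert_to_tensor([[464, 1893]], dtype=tf.int32)  # "The president"

try:
    model.generate(input_ids, do_sample=False, num_return_sequences=2)
except ValueError as err:
    # generation is refused up front rather than failing an internal assert
    print(err)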
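The test_decoder_batch and test_processor_from_auto_processor changes above rely on Wav2Vec2ProcessorWithLM.batch_decode returning logit_score and lm_score alongside text, and on AutoProcessor resolving to the same processor class. A rough usage sketch under those assumptions; the logits shape and vocabulary size below are placeholders and must match the checkpoint's decoder alphabet:

# Sketch only: batch_decode on a processor with a language model now returns text,
# logit_score and lm_score. The repo id comes from the tests above; the logits here
# are random placeholders and their last dimension must equal the size of the
# decoder's alphabet for this checkpoint (32 is an assumption, not a verified value).
import numpy as np

from transformers import AutoProcessor, Wav2Vec2ProcessorWithLM

processor = Wav2Vec2ProcessorWithLM.from_pretrained("hf-internal-testing/processor_with_lm")

batch_size, sequence_length, vocab_size = 2, 100, 32  # vocab_size is a placeholder
logits = np.random.randn(batch_size, sequence_length, vocab_size).astype(np.float32)

decoded = processor.batch_decode(logits)
print(decoded.text)         # list of transcriptions, one per batch element
print(decoded.logit_score)  # per-sequence logit scores from the beam search
print(decoded.lm_score)     # per-sequence language-model scores

# AutoProcessor resolves to the same class, as checked in test_processor_from_auto_processor
auto_processor = AutoProcessor.from_pretrained("hf-internal-testing/processor_with_lm")
assert auto_processor.batch_decode(logits).text == decoded.text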
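The dynamic-tokenizer tests and the dict-style auto_map assertions above describe how a tokenizer class stored inside a Hub repo is resolved. A minimal sketch using the hf-internal-testing/test_dynamic_tokenizer repo already referenced in test_from_pretrained_dynamic_tokenizer; only calls and class names that appear in the diff are used:

# Sketch only: consuming the dict-style auto_map. With trust_remote_code=True the
# tokenizer class referenced by the repo's tokenizer_config.json, e.g.
# {"auto_map": {"AutoTokenizer": ["custom_tokenization.CustomTokenizer",
#                                 "custom_tokenization_fast.CustomTokenizerFast"]}},
# is downloaded from the repo and imported instead of a library class.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=True
)
print(type(tokenizer).__name__)  # "NewTokenizerFast" when `tokenizers` is installed

# use_fast=False picks the slow entry of the same mapping
slow_tokenizer = AutoTokenizer.from_pretrained(
    "hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=True, use_fast=False
)
print(type(slow_tokenizer).__name__)  # "NewTokenizer"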