From cb058154a4c9de9aa70d7ef050c62217a0e611c1 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 9 Apr 2021 18:26:02 -0700 Subject: [PATCH 01/20] sync --- .circleci/config.yml | 8 +- .circleci/deploy.sh | 3 +- .github/workflows/self-scheduled.yml | 4 +- .gitignore | 3 +- README.md | 3 + docs/source/_static/js/custom.js | 7 +- docs/source/community.md | 1 + docs/source/conf.py | 104 +- docs/source/index.rst | 94 +- docs/source/main_classes/trainer.rst | 670 +++++- docs/source/model_doc/auto.rst | 7 + docs/source/model_doc/bert.rst | 2 +- docs/source/model_doc/convbert.rst | 3 +- docs/source/model_doc/cpm.rst | 44 + docs/source/model_doc/gpt_neo.rst | 4 +- docs/source/model_doc/led.rst | 3 +- docs/source/model_doc/megatron_bert.rst | 153 ++ docs/source/model_doc/megatron_gpt2.rst | 70 + docs/source/testing.rst | 23 +- examples/language-modeling/run_clm.py | 23 +- .../language-modeling/run_clm_no_trainer.py | 4 +- examples/language-modeling/run_mlm.py | 13 +- .../language-modeling/run_mlm_no_trainer.py | 4 +- examples/language-modeling/run_plm.py | 4 +- .../legacy/question-answering/run_squad.py | 2 +- examples/legacy/seq2seq/README.md | 2 +- examples/legacy/seq2seq/seq2seq_trainer.py | 4 +- examples/multiple-choice/run_swag.py | 6 +- examples/question-answering/run_qa.py | 4 +- .../question-answering/run_qa_beam_search.py | 4 +- .../run_qa_beam_search_no_trainer.py | 797 +++++++ .../question-answering/run_qa_no_trainer.py | 753 +++++++ examples/question-answering/run_tf_squad.py | 2 +- examples/question-answering/utils_qa.py | 4 +- .../movement-pruning/masked_run_squad.py | 2 +- examples/seq2seq/run_summarization.py | 4 +- examples/seq2seq/run_translation.py | 41 +- examples/tests/deepspeed/test_deepspeed.py | 427 ---- examples/text-classification/run_glue.py | 6 +- .../run_glue_no_trainer.py | 2 +- examples/text-classification/run_xnli.py | 2 +- examples/token-classification/run_ner.py | 2 +- setup.py | 47 +- src/transformers/__init__.py | 98 +- src/transformers/configuration_utils.py | 2 +- src/transformers/convert_slow_tokenizer.py | 6 +- src/transformers/data/data_collator.py | 13 +- src/transformers/data/datasets/squad.py | 2 +- src/transformers/data/metrics/__init__.py | 12 +- src/transformers/dependency_versions_check.py | 6 +- src/transformers/dependency_versions_table.py | 11 +- src/transformers/feature_extraction_utils.py | 9 + src/transformers/file_utils.py | 102 +- src/transformers/generation_logits_process.py | 6 +- src/transformers/generation_utils.py | 182 +- src/transformers/integrations.py | 161 +- src/transformers/modeling_flax_utils.py | 1 + src/transformers/modeling_tf_utils.py | 9 +- src/transformers/modeling_utils.py | 56 +- src/transformers/models/__init__.py | 2 + .../models/albert/tokenization_albert.py | 9 +- .../models/albert/tokenization_albert_fast.py | 31 - src/transformers/models/auto/__init__.py | 4 + src/transformers/models/auto/auto_factory.py | 39 +- .../models/auto/configuration_auto.py | 23 +- .../models/auto/feature_extraction_auto.py | 150 ++ src/transformers/models/auto/modeling_auto.py | 26 +- .../models/auto/modeling_tf_auto.py | 3 +- .../models/auto/tokenization_auto.py | 5 +- src/transformers/models/bart/modeling_bart.py | 2 +- .../models/barthez/tokenization_barthez.py | 9 +- .../barthez/tokenization_barthez_fast.py | 30 - src/transformers/models/bert/modeling_bert.py | 2 +- .../models/bert/tokenization_bert.py | 9 +- .../modeling_bert_generation.py | 2 +- .../models/bertweet/tokenization_bertweet.py | 9 +- 
.../models/big_bird/modeling_big_bird.py | 2 +- .../models/big_bird/tokenization_big_bird.py | 9 +- .../models/blenderbot/modeling_blenderbot.py | 2 +- .../modeling_blenderbot_small.py | 2 +- .../camembert/tokenization_camembert.py | 9 +- .../camembert/tokenization_camembert_fast.py | 30 - src/transformers/models/cpm/__init__.py | 48 + .../models/cpm/tokenization_cpm.py | 109 + .../models/deberta/tokenization_deberta.py | 9 +- .../deberta_v2/tokenization_deberta_v2.py | 12 +- .../models/electra/modeling_electra.py | 2 +- .../models/fsmt/tokenization_fsmt.py | 12 +- src/transformers/models/gpt2/modeling_gpt2.py | 2 +- .../models/gpt_neo/modeling_gpt_neo.py | 423 ++-- .../herbert/tokenization_herbert_fast.py | 9 +- .../models/layoutlm/modeling_layoutlm.py | 2 +- src/transformers/models/led/modeling_led.py | 2 +- .../models/m2m_100/modeling_m2m_100.py | 2 +- .../models/m2m_100/tokenization_m2m_100.py | 10 +- .../models/marian/modeling_marian.py | 2 +- .../models/mbart/modeling_mbart.py | 2 +- .../models/mbart/tokenization_mbart.py | 10 +- .../models/mbart/tokenization_mbart50.py | 10 +- .../models/mbart/tokenization_mbart50_fast.py | 32 - .../models/mbart/tokenization_mbart_fast.py | 32 - .../models/megatron_bert/__init__.py | 74 + .../configuration_megatron_bert.py | 132 ++ .../convert_megatron_bert_checkpoint.py | 265 +++ .../megatron_bert/modeling_megatron_bert.py | 1827 +++++++++++++++++ .../convert_megatron_gpt2_checkpoint.py | 238 +++ .../models/mpnet/tokenization_mpnet.py | 9 +- .../models/pegasus/modeling_pegasus.py | 2 +- .../models/phobert/tokenization_phobert.py | 9 +- .../models/prophetnet/modeling_prophetnet.py | 2 +- .../prophetnet/tokenization_prophetnet.py | 9 +- src/transformers/models/rag/retrieval_rag.py | 15 +- .../models/reformer/configuration_reformer.py | 2 +- .../models/roberta/modeling_roberta.py | 2 +- .../models/roberta/tokenization_roberta.py | 9 +- .../models/speech_to_text/__init__.py | 18 +- .../feature_extraction_speech_to_text.py | 12 +- .../speech_to_text/modeling_speech_to_text.py | 2 +- .../tokenization_speech_to_text.py | 10 +- src/transformers/models/t5/tokenization_t5.py | 10 +- .../models/tapas/modeling_tapas.py | 4 +- .../models/tapas/tokenization_tapas.py | 9 +- src/transformers/models/vit/modeling_vit.py | 2 +- ..._original_pytorch_checkpoint_to_pytorch.py | 2 +- .../models/xlm/modeling_tf_xlm.py | 2 +- src/transformers/models/xlm/modeling_xlm.py | 2 +- .../models/xlm/tokenization_xlm.py | 12 +- .../tokenization_xlm_prophetnet.py | 9 +- .../xlm_roberta/tokenization_xlm_roberta.py | 9 +- .../tokenization_xlm_roberta_fast.py | 31 - .../models/xlnet/tokenization_xlnet.py | 9 +- .../models/xlnet/tokenization_xlnet_fast.py | 31 - src/transformers/pipelines/__init__.py | 12 +- src/transformers/pipelines/base.py | 20 +- .../pipelines/table_question_answering.py | 4 +- .../pipelines/zero_shot_classification.py | 2 +- src/transformers/testing_utils.py | 22 + src/transformers/tokenization_utils.py | 10 + src/transformers/tokenization_utils_base.py | 8 +- src/transformers/trainer.py | 97 +- src/transformers/trainer_callback.py | 4 +- src/transformers/trainer_pt_utils.py | 4 +- src/transformers/trainer_seq2seq.py | 3 + src/transformers/training_args.py | 6 + src/transformers/utils/__init__.py | 4 +- src/transformers/utils/dummy_flax_objects.py | 74 +- src/transformers/utils/dummy_pt_objects.py | 1276 ++++++------ .../dummy_sentencepiece_and_speech_objects.py | 7 + ...my_sentencepiece_and_tokenizers_objects.py | 9 + .../utils/dummy_sentencepiece_objects.py | 75 
+- .../utils/dummy_speech_objects.py | 7 + src/transformers/utils/dummy_tf_objects.py | 740 +++---- .../utils/dummy_tokenizers_objects.py | 141 +- .../utils/dummy_vision_objects.py | 6 +- .../utils/modeling_auto_mapping.py | 1 + src/transformers/utils/notebook.py | 11 +- src/transformers/utils/versions.py | 55 +- ...ng_{{cookiecutter.lowercase_modelname}}.py | 4 +- ...on_{{cookiecutter.lowercase_modelname}}.py | 9 +- .../{{cookiecutter.lowercase_modelname}}.rst | 3 +- .../deepspeed/ds_config_zero2.json | 2 +- tests/deepspeed/ds_config_zero3.json | 48 + tests/deepspeed/test_deepspeed.py | 637 ++++++ .../extended}/test_trainer_ext.py | 5 +- .../dummy_feature_extractor_config.json | 3 + tests/sagemaker/README.md | 7 +- .../pytorch/run_glue_model_parallelism.py | 4 +- .../test_multi_node_data_parallel.py | 24 +- .../test_multi_node_model_parallel.py | 35 +- tests/sagemaker/test_single_node_gpu.py | 20 +- tests/test_data_collator.py | 56 +- tests/test_feature_extraction_auto.py | 44 + .../test_feature_extraction_speech_to_text.py | 7 +- tests/test_logging.py | 4 +- tests/test_modeling_albert.py | 3 +- tests/test_modeling_auto.py | 32 +- tests/test_modeling_bert.py | 3 +- tests/test_modeling_big_bird.py | 45 +- tests/test_modeling_common.py | 27 +- tests/test_modeling_convbert.py | 3 +- tests/test_modeling_electra.py | 3 +- tests/test_modeling_flax_bert.py | 2 + tests/test_modeling_funnel.py | 3 +- tests/test_modeling_gpt_neo.py | 191 +- tests/test_modeling_led.py | 3 +- tests/test_modeling_lxmert.py | 5 +- tests/test_modeling_megatron_bert.py | 378 ++++ tests/test_modeling_mobilebert.py | 3 +- tests/test_modeling_tapas.py | 19 +- tests/test_modeling_tf_albert.py | 3 +- tests/test_modeling_tf_auto.py | 30 +- tests/test_modeling_tf_bert.py | 3 +- tests/test_modeling_tf_common.py | 31 +- tests/test_processor_speech_to_text.py | 6 +- tests/test_tokenization_cpm.py | 39 + tests/test_trainer.py | 1 + tests/test_trainer_callback.py | 2 +- tests/test_versions_utils.py | 7 +- utils/check_dummies.py | 45 +- utils/check_inits.py | 35 +- utils/check_repo.py | 65 +- 201 files changed, 9533 insertions(+), 2913 deletions(-) create mode 100644 docs/source/model_doc/cpm.rst create mode 100644 docs/source/model_doc/megatron_bert.rst create mode 100644 docs/source/model_doc/megatron_gpt2.rst create mode 100644 examples/question-answering/run_qa_beam_search_no_trainer.py create mode 100755 examples/question-answering/run_qa_no_trainer.py delete mode 100644 examples/tests/deepspeed/test_deepspeed.py create mode 100644 src/transformers/models/auto/feature_extraction_auto.py create mode 100644 src/transformers/models/cpm/__init__.py create mode 100644 src/transformers/models/cpm/tokenization_cpm.py create mode 100644 src/transformers/models/megatron_bert/__init__.py create mode 100644 src/transformers/models/megatron_bert/configuration_megatron_bert.py create mode 100644 src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py create mode 100755 src/transformers/models/megatron_bert/modeling_megatron_bert.py create mode 100644 src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py create mode 100644 src/transformers/utils/dummy_sentencepiece_and_speech_objects.py create mode 100644 src/transformers/utils/dummy_sentencepiece_and_tokenizers_objects.py create mode 100644 src/transformers/utils/dummy_speech_objects.py rename examples/tests/deepspeed/ds_config.json => tests/deepspeed/ds_config_zero2.json (96%) create mode 100644 tests/deepspeed/ds_config_zero3.json create mode 100644 
tests/deepspeed/test_deepspeed.py rename {examples/tests/trainer => tests/extended}/test_trainer_ext.py (98%) create mode 100644 tests/fixtures/dummy_feature_extractor_config.json create mode 100644 tests/test_feature_extraction_auto.py create mode 100644 tests/test_modeling_megatron_bert.py create mode 100644 tests/test_tokenization_cpm.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 999af392fbb3ca..4b490f3259e348 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -145,7 +145,7 @@ jobs: key: v0.4-torch-{{ checksum "setup.py" }} paths: - '~/.cache/pip' - - run: python -m pytest -n 8 --dist=loadfile -s --make-reports=tests_torch ./tests/ | tee tests_output.txt + - run: python -m pytest -n 4 --dist=loadfile -s --make-reports=tests_torch ./tests/ | tee tests_output.txt - store_artifacts: path: ~/transformers/tests_output.txt - store_artifacts: @@ -277,7 +277,7 @@ jobs: - v0.4-custom_tokenizers-{{ checksum "setup.py" }} - v0.4-{{ checksum "setup.py" }} - run: pip install --upgrade pip - - run: pip install .[ja,testing,sentencepiece] + - run: pip install .[ja,testing,sentencepiece,jieba] - run: python -m unidic download - save_cache: key: v0.4-custom_tokenizers-{{ checksum "setup.py" }} @@ -348,7 +348,7 @@ jobs: - v0.4-{{ checksum "setup.py" }} - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev - run: pip install --upgrade pip - - run: pip install ."[all, docs]" + - run: pip install ."[docs]" - save_cache: key: v0.4-build_doc-{{ checksum "setup.py" }} paths: @@ -370,7 +370,7 @@ jobs: keys: - v0.4-deploy_doc-{{ checksum "setup.py" }} - v0.4-{{ checksum "setup.py" }} - - run: pip install ."[all,docs]" + - run: pip install ."[docs]" - save_cache: key: v0.4-deploy_doc-{{ checksum "setup.py" }} paths: diff --git a/.circleci/deploy.sh b/.circleci/deploy.sh index 8c99d89cad61c4..f66bf3cbe35976 100755 --- a/.circleci/deploy.sh +++ b/.circleci/deploy.sh @@ -60,4 +60,5 @@ deploy_doc "7d9a9d0" v4.2.2 deploy_doc "bae0c79" v4.3.3 deploy_doc "c988db5" v4.4.0 deploy_doc "c5d6a28" v4.4.1 -deploy_doc "6bc89ed" # v4.4.2 Latest stable release \ No newline at end of file +deploy_doc "6bc89ed" v4.4.2 +deploy_doc "4906a29" # v4.5.0 Latest stable release \ No newline at end of file diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 3b72baea0d2b76..978d9e02a69d38 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -33,7 +33,7 @@ jobs: run: | apt -y update && apt install -y libsndfile1-dev pip install --upgrade pip - pip install .[sklearn,testing,onnxruntime,sentencepiece,speech] + pip install .[sklearn,testing,onnxruntime,sentencepiece,speech,deepspeed] - name: Are GPUs recognized by our DL frameworks run: | @@ -155,7 +155,7 @@ jobs: run: | apt -y update && apt install -y libsndfile1-dev pip install --upgrade pip - pip install .[sklearn,testing,onnxruntime,sentencepiece,speech] + pip install .[sklearn,testing,onnxruntime,sentencepiece,speech,deepspeed,fairscale] - name: Are GPUs recognized by our DL frameworks run: | diff --git a/.gitignore b/.gitignore index 36cbb4f7ea399f..965fbeec77f51d 100644 --- a/.gitignore +++ b/.gitignore @@ -9,8 +9,7 @@ __pycache__/ *.so # tests and logs -tests/fixtures/* -!tests/fixtures/sample_text_no_unicode.txt +tests/fixtures/cached_*_text.txt logs/ lightning_logs/ lang_code_data/ diff --git a/README.md b/README.md index dd535688cb9333..18b2eff45b6cdf 100644 --- a/README.md +++ b/README.md @@ -200,6 +200,7 @@ Current number of checkpoints: 
![](https://img.shields.io/endpoint?url=https://h 1. **[BORT](https://huggingface.co/transformers/model_doc/bort.html)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry. 1. **[CamemBERT](https://huggingface.co/transformers/model_doc/camembert.html)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[ConvBERT](https://huggingface.co/transformers/model_doc/convbert.html)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. +1. **[CPM](https://huggingface.co/transformers/model_doc/cpm.html)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. 1. **[CTRL](https://huggingface.co/transformers/model_doc/ctrl.html)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 1. **[DeBERTa](https://huggingface.co/transformers/model_doc/deberta.html)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeBERTa-v2](https://huggingface.co/transformers/model_doc/deberta_v2.html)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. @@ -223,6 +224,8 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 1. **[MarianMT](https://huggingface.co/transformers/model_doc/marian.html)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team. 1. **[MBart](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. 1. **[MBart-50](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. +1. 
**[Megatron-BERT](https://huggingface.co/transformers/model_doc/megatron_bert.html)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. +1. **[Megatron-GPT2](https://huggingface.co/transformers/model_doc/megatron_gpt2.html)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. 1. **[MPNet](https://huggingface.co/transformers/model_doc/mpnet.html)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. 1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. 1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777)> by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. diff --git a/docs/source/_static/js/custom.js b/docs/source/_static/js/custom.js index f8cc2db044c5bd..5fdab31a04dfc6 100644 --- a/docs/source/_static/js/custom.js +++ b/docs/source/_static/js/custom.js @@ -1,10 +1,11 @@ // These two things need to be updated at each release for the version selector. // Last stable version -const stableVersion = "v4.4.2" +const stableVersion = "v4.5.0" // Dictionary doc folder to label. The last stable version should have an empty key. const versionMapping = { "master": "master", - "": "v4.4.0/v4.4.1/v4.4.2 (stable)", + "": "v4.5.0 (stable)", + "v4.4.2": "v4.4.0/v4.4.1/v4.4.2", "v4.3.3": "v4.3.0/v4.3.1/v4.3.2/v4.3.3", "v4.2.2": "v4.2.0/v4.2.1/v4.2.2", "v4.1.1": "v4.1.0/v4.1.1", @@ -62,7 +63,7 @@ function addIcon() { function addCustomFooter() { const customFooter = document.createElement("div"); const questionOrIssue = document.createElement("div"); - questionOrIssue.innerHTML = "Stuck? Read our Blog posts or Create an issue"; + questionOrIssue.innerHTML = "Stuck? 
Read our Blog posts or Create an issue"; customFooter.appendChild(questionOrIssue); customFooter.classList.add("footer"); diff --git a/docs/source/community.md b/docs/source/community.md index 4a6e39a76a5058..e1b467863df15e 100644 --- a/docs/source/community.md +++ b/docs/source/community.md @@ -51,3 +51,4 @@ This page regroups resources around 🤗 Transformers developed by the community |[Wav2Vec2 CTC decoding with GPT2 adjustment](https://github.com/voidful/huggingface_notebook/blob/main/xlsr_gpt.ipynb) | How to decode CTC sequence with language model adjustment | [Eric Lam](https://github.com/voidful) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1e_z5jQHYbO2YKEaUgzb1ww1WwiAyydAj?usp=sharing)| |[Fine-tune BART for summarization in two languages with Trainer class](https://github.com/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb) | How to fine-tune BART for summarization in two languages with Trainer class | [Eliza Szczechla](https://github.com/elsanns) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb)| |[Evaluate Big Bird on Trivia QA](https://github.com/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb) | How to evaluate BigBird on long document question answering on Trivia QA | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb)| +| [Create video captions using Wav2Vec2](https://github.com/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | How to create YouTube captions from any video by transcribing the audio with Wav2Vec | [Niklas Muennighoff](https://github.com/Muennighoff) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | diff --git a/docs/source/conf.py b/docs/source/conf.py index 81c93caa0ab070..207ca9e8a57653 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -14,23 +14,24 @@ # import os import sys -sys.path.insert(0, os.path.abspath('../../src')) + +sys.path.insert(0, os.path.abspath("../../src")) # -- Project information ----------------------------------------------------- -project = u'transformers' -copyright = u'2020, The Hugging Face Team, Licenced under the Apache License, Version 2.0' -author = u'huggingface' +project = "transformers" +copyright = "2020, The Hugging Face Team, Licenced under the Apache License, Version 2.0" +author = "huggingface" # The short X.Y version -version = u'' +version = "" # The full version, including alpha/beta/rc tags -release = u'4.5.0.dev0' +release = "4.5.0.dev0" # Prefix link to point to master, comment this during version release and uncomment below line -extlinks = {'prefix_link': ('https://github.com/huggingface/transformers/blob/master/%s', '')} +extlinks = {"prefix_link": ("https://github.com/huggingface/transformers/blob/master/%s", "")} # Prefix link to always point to corresponding version, uncomment this during version release # extlinks = {'prefix_link': ('https://github.com/huggingface/transformers/blob/v'+ release + '/%s', '')} @@ -44,27 +45,28 @@ # extensions coming with Sphinx (named 
'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.extlinks', - 'sphinx.ext.coverage', - 'sphinx.ext.napoleon', - 'recommonmark', - 'sphinx.ext.viewcode', - 'sphinx_markdown_tables', - 'sphinx_copybutton' + "sphinx.ext.autodoc", + "sphinx.ext.extlinks", + "sphinx.ext.coverage", + "sphinx.ext.napoleon", + "recommonmark", + "sphinx.ext.viewcode", + "sphinx_markdown_tables", + "sphinxext.opengraph", + "sphinx_copybutton", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # -source_suffix = ['.rst', '.md'] +source_suffix = [".rst", ".md"] # source_suffix = '.rst' # The master toctree document. -master_doc = 'index' +master_doc = "index" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -76,7 +78,7 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = [u'_build', 'Thumbs.db', '.DS_Store'] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] # The name of the Pygments (syntax highlighting) style to use. pygments_style = None @@ -90,21 +92,30 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'sphinx_rtd_theme' +html_theme = "sphinx_rtd_theme" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. # -html_theme_options = { - 'analytics_id': 'UA-83738774-2', - 'navigation_with_keys': True -} +html_theme_options = {"analytics_id": "UA-83738774-2", "navigation_with_keys": True} + +# Configuration for OpenGraph and Twitter Card Tags. +# These are responsible for creating nice shareable social images https://ahrefs.com/blog/open-graph-meta-tags/ +# https://ogp.me/#type_website +ogp_image = "https://huggingface.co/front/thumbnails/transformers.png" +ogp_description = "State-of-the-art Natural Language Processing for PyTorch and TensorFlow 2.0. Transformers provides thousands of pretrained models to perform tasks on texts such as classification, information extraction, question answering, summarization, translation, text generation, etc in 100+ languages. Its aim is to make cutting-edge NLP easier to use for everyone" +ogp_description_length = 160 + +ogp_custom_meta_tags = [ + f'', + f'', +] # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # Custom sidebar templates, must be a dictionary that maps document names # to template names. @@ -116,17 +127,17 @@ # # html_sidebars = {} -# This must be the name of an image file (path relative to the configuration -# directory) that is the favicon of the docs. Modern browsers use this as -# the icon for tabs, windows and bookmarks. It should be a Windows-style +# This must be the name of an image file (path relative to the configuration +# directory) that is the favicon of the docs. Modern browsers use this as +# the icon for tabs, windows and bookmarks. It should be a Windows-style # icon file (.ico). 
-html_favicon = 'favicon.ico' +html_favicon = "favicon.ico" # -- Options for HTMLHelp output --------------------------------------------- # Output file base name for HTML help builder. -htmlhelp_basename = 'transformersdoc' +htmlhelp_basename = "transformersdoc" # -- Options for LaTeX output ------------------------------------------------ @@ -135,15 +146,12 @@ # The paper size ('letterpaper' or 'a4paper'). # # 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). # # 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. # # 'preamble': '', - # Latex figure (float) alignment # # 'figure_align': 'htbp', @@ -153,8 +161,7 @@ # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, 'transformers.tex', u'transformers Documentation', - u'huggingface', 'manual'), + (master_doc, "transformers.tex", "transformers Documentation", "huggingface", "manual"), ] @@ -162,10 +169,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'transformers', u'transformers Documentation', - [author], 1) -] +man_pages = [(master_doc, "transformers", "transformers Documentation", [author], 1)] # -- Options for Texinfo output ---------------------------------------------- @@ -174,9 +178,15 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'transformers', u'transformers Documentation', - author, 'transformers', 'One line description of project.', - 'Miscellaneous'), + ( + master_doc, + "transformers", + "transformers Documentation", + author, + "transformers", + "One line description of project.", + "Miscellaneous", + ), ] @@ -195,11 +205,13 @@ # epub_uid = '' # A list of files that should not be packed into the epub file. -epub_exclude_files = ['search.html'] +epub_exclude_files = ["search.html"] + def setup(app): - app.add_css_file('css/huggingface.css') - app.add_css_file('css/code-snippets.css') - app.add_js_file('js/custom.js') + app.add_css_file("css/huggingface.css") + app.add_css_file("css/code-snippets.css") + app.add_js_file("js/custom.js") + # -- Extension configuration ------------------------------------------------- diff --git a/docs/source/index.rst b/docs/source/index.rst index 9692abcde9986d..ebf09989e682e3 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -114,122 +114,133 @@ and conversion utilities for the following models: 11. :doc:`ConvBERT ` (from YituTech) released with the paper `ConvBERT: Improving BERT with Span-based Dynamic Convolution `__ by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. -12. :doc:`CTRL ` (from Salesforce) released with the paper `CTRL: A Conditional Transformer Language +12. :doc:`CPM ` (from Tsinghua University) released with the paper `CPM: A Large-scale Generative + Chinese Pre-trained Language Model `__ by Zhengyan Zhang, Xu Han, Hao Zhou, Pei + Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, + Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, + Juanzi Li, Xiaoyan Zhu, Maosong Sun. +13. :doc:`CTRL ` (from Salesforce) released with the paper `CTRL: A Conditional Transformer Language Model for Controllable Generation `__ by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. -13. 
:doc:`DeBERTa ` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT with +14. :doc:`DeBERTa ` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT with Disentangled Attention `__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. -14. :doc:`DeBERTa-v2 ` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT +15. :doc:`DeBERTa-v2 ` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT with Disentangled Attention `__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. -15. :doc:`DialoGPT ` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale +16. :doc:`DialoGPT ` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation `__ by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. -16. :doc:`DistilBERT ` (from HuggingFace), released together with the paper `DistilBERT, a +17. :doc:`DistilBERT ` (from HuggingFace), released together with the paper `DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter `__ by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2 `__, RoBERTa into `DistilRoBERTa `__, Multilingual BERT into `DistilmBERT `__ and a German version of DistilBERT. -17. :doc:`DPR ` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain +18. :doc:`DPR ` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain Question Answering `__ by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. -18. :doc:`ELECTRA ` (from Google Research/Stanford University) released with the paper `ELECTRA: +19. :doc:`ELECTRA ` (from Google Research/Stanford University) released with the paper `ELECTRA: Pre-training text encoders as discriminators rather than generators `__ by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. -19. :doc:`FlauBERT ` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model +20. :doc:`FlauBERT ` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model Pre-training for French `__ by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. -20. :doc:`Funnel Transformer ` (from CMU/Google Brain) released with the paper `Funnel-Transformer: +21. :doc:`Funnel Transformer ` (from CMU/Google Brain) released with the paper `Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing `__ by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. -21. :doc:`GPT ` (from OpenAI) released with the paper `Improving Language Understanding by Generative +22. :doc:`GPT ` (from OpenAI) released with the paper `Improving Language Understanding by Generative Pre-Training `__ by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. -22. :doc:`GPT-2 ` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask +23. :doc:`GPT-2 ` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask Learners `__ by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**. -23. :doc:`GPT Neo ` (from EleutherAI) released in the repository `EleutherAI/gpt-neo +24. 
:doc:`GPT Neo ` (from EleutherAI) released in the repository `EleutherAI/gpt-neo `__ by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. -24. :doc:`I-BERT ` (from Berkeley) released with the paper `I-BERT: Integer-only BERT Quantization +25. :doc:`I-BERT ` (from Berkeley) released with the paper `I-BERT: Integer-only BERT Quantization `__ by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer -25. :doc:`LayoutLM ` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training +26. :doc:`LayoutLM ` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training of Text and Layout for Document Image Understanding `__ by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou. -26. :doc:`LED ` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer +27. :doc:`LED ` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer `__ by Iz Beltagy, Matthew E. Peters, Arman Cohan. -27. :doc:`Longformer ` (from AllenAI) released with the paper `Longformer: The Long-Document +28. :doc:`Longformer ` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer `__ by Iz Beltagy, Matthew E. Peters, Arman Cohan. -28. :doc:`LXMERT ` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality +29. :doc:`LXMERT ` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering `__ by Hao Tan and Mohit Bansal. -29. :doc:`M2M100 ` (from Facebook) released with the paper `Beyond English-Centric Multilingual +30. :doc:`M2M100 ` (from Facebook) released with the paper `Beyond English-Centric Multilingual Machine Translation `__ by by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin. -30. :doc:`MarianMT ` Machine translation models trained using `OPUS `__ data by +31. :doc:`MarianMT ` Machine translation models trained using `OPUS `__ data by Jörg Tiedemann. The `Marian Framework `__ is being developed by the Microsoft Translator Team. -31. :doc:`MBart ` (from Facebook) released with the paper `Multilingual Denoising Pre-training for +32. :doc:`MBart ` (from Facebook) released with the paper `Multilingual Denoising Pre-training for Neural Machine Translation `__ by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. -32. :doc:`MBart-50 ` (from Facebook) released with the paper `Multilingual Translation with Extensible +33. :doc:`MBart-50 ` (from Facebook) released with the paper `Multilingual Translation with Extensible Multilingual Pretraining and Finetuning `__ by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. -33. :doc:`MPNet ` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted +34. :doc:`Megatron-BERT ` (from NVIDIA) released with the paper `Megatron-LM: Training + Multi-Billion Parameter Language Models Using Model Parallelism `__ by Mohammad + Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. +35. 
:doc:`Megatron-GPT2 ` (from NVIDIA) released with the paper `Megatron-LM: Training + Multi-Billion Parameter Language Models Using Model Parallelism `__ by Mohammad + Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. +36. :doc:`MPNet ` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted Pre-training for Language Understanding `__ by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. -34. :doc:`MT5 ` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained +37. :doc:`MT5 ` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained text-to-text transformer `__ by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. -35. :doc:`Pegasus ` (from Google) released with the paper `PEGASUS: Pre-training with Extracted +38. :doc:`Pegasus ` (from Google) released with the paper `PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization `__> by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. -36. :doc:`ProphetNet ` (from Microsoft Research) released with the paper `ProphetNet: Predicting +39. :doc:`ProphetNet ` (from Microsoft Research) released with the paper `ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training `__ by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. -37. :doc:`Reformer ` (from Google Research) released with the paper `Reformer: The Efficient +40. :doc:`Reformer ` (from Google Research) released with the paper `Reformer: The Efficient Transformer `__ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. -38. :doc:`RoBERTa ` (from Facebook), released together with the paper a `Robustly Optimized BERT +41. :doc:`RoBERTa ` (from Facebook), released together with the paper a `Robustly Optimized BERT Pretraining Approach `__ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. -39. :doc:`SpeechToTextTransformer ` (from Facebook), released together with the paper +42. :doc:`SpeechToTextTransformer ` (from Facebook), released together with the paper `fairseq S2T: Fast Speech-to-Text Modeling with fairseq `__ by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. -40. :doc:`SqueezeBert ` released with the paper `SqueezeBERT: What can computer vision teach NLP +43. :doc:`SqueezeBert ` released with the paper `SqueezeBERT: What can computer vision teach NLP about efficient neural networks? `__ by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer. -41. :doc:`T5 ` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a +44. :doc:`T5 ` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer `__ by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. -42. :doc:`TAPAS ` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via +45. :doc:`TAPAS ` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via Pre-training `__ by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. -43. :doc:`Transformer-XL ` (from Google/CMU) released with the paper `Transformer-XL: +46. 
:doc:`Transformer-XL ` (from Google/CMU) released with the paper `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context `__ by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. -44. :doc:`Vision Transformer (ViT) ` (from Google AI) released with the paper `An Image is Worth 16x16 +47. :doc:`Vision Transformer (ViT) ` (from Google AI) released with the paper `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale `__ by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. -45. :doc:`Wav2Vec2 ` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for +48. :doc:`Wav2Vec2 ` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations `__ by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. -46. :doc:`XLM ` (from Facebook) released together with the paper `Cross-lingual Language Model +49. :doc:`XLM ` (from Facebook) released together with the paper `Cross-lingual Language Model Pretraining `__ by Guillaume Lample and Alexis Conneau. -47. :doc:`XLM-ProphetNet ` (from Microsoft Research) released with the paper `ProphetNet: +50. :doc:`XLM-ProphetNet ` (from Microsoft Research) released with the paper `ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training `__ by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. -48. :doc:`XLM-RoBERTa ` (from Facebook AI), released together with the paper `Unsupervised +51. :doc:`XLM-RoBERTa ` (from Facebook AI), released together with the paper `Unsupervised Cross-lingual Representation Learning at Scale `__ by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. -49. :doc:`XLNet ` (from Google/CMU) released with the paper `​XLNet: Generalized Autoregressive +52. :doc:`XLNet ` (from Google/CMU) released with the paper `​XLNet: Generalized Autoregressive Pretraining for Language Understanding `__ by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. -50. :doc:`XLSR-Wav2Vec2 ` (from Facebook AI) released with the paper `Unsupervised +53. :doc:`XLSR-Wav2Vec2 ` (from Facebook AI) released with the paper `Unsupervised Cross-Lingual Representation Learning For Speech Recognition `__ by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. @@ -304,6 +315,8 @@ TensorFlow and/or Flax. +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ | Marian | ✅ | ❌ | ✅ | ✅ | ❌ | +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| MegatronBert | ❌ | ❌ | ✅ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ | MobileBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ | OpenAI GPT | ✅ | ✅ | ✅ | ✅ | ❌ | @@ -429,6 +442,7 @@ TensorFlow and/or Flax. model_doc/bort model_doc/camembert model_doc/convbert + model_doc/cpm model_doc/ctrl model_doc/deberta model_doc/deberta_v2 @@ -449,6 +463,8 @@ TensorFlow and/or Flax. 
model_doc/marian model_doc/m2m_100 model_doc/mbart + model_doc/megatron_bert + model_doc/megatron_gpt2 model_doc/mobilebert model_doc/mpnet model_doc/mt5 diff --git a/docs/source/main_classes/trainer.rst b/docs/source/main_classes/trainer.rst index d50a6664d3fc65..aae325076cec8a 100644 --- a/docs/source/main_classes/trainer.rst +++ b/docs/source/main_classes/trainer.rst @@ -134,6 +134,8 @@ Toward Training Trillion Parameter Models, by Samyam Rajbhandari, Jeff Rasley, O This provided support is new and experimental as of this writing. +.. _zero-install-notes: + Installation Notes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -156,7 +158,8 @@ please, read the following notes first. In these notes we give examples for what to do when ``pytorch`` has been built with CUDA ``10.2``. If your situation is different remember to adjust the version number to the one you are after. -**Possible problem #1:** +Possible problem #1 +======================================================================================================================= While, Pytorch comes with its own CUDA toolkit, to build these two projects you must have an identical version of CUDA installed system-wide. @@ -176,7 +179,8 @@ If you don't have CUDA installed system-wide, install it first. You will find th search engine. For example, if you're on Ubuntu you may want to search for: `ubuntu cuda 10.2 install `__. -**Possible problem #2:** +Possible problem #2 +======================================================================================================================= Another possible common problem is that you may have more than one CUDA toolkit installed system-wide. For example you may have: @@ -222,7 +226,8 @@ exist. ``lib64`` sub-directory is where the various CUDA ``.so`` objects, like ` that your system will have it named differently, but if it is adjust it to reflect your reality. -**Possible problem #3:** +Possible problem #3 +======================================================================================================================= Some older CUDA versions may refuse to build with newer compilers. For example, you my have ``gcc-9`` but it wants ``gcc-7``. @@ -247,13 +252,6 @@ should find ``gcc-7`` (and ``g++7``) and then the build will succeed. As always make sure to edit the paths in the example to match your situation. -**If still unsuccessful:** - -If after addressing these you still encounter build issues, please, proceed with the GitHub Issue of `FairScale -`__ and `Deepspeed -`__, depending on the project you have the problem with. - - FairScale ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -267,20 +265,74 @@ provides support for the following features from `the ZeRO paper `__. + +If you're still struggling with the build, first make sure to read :ref:`zero-install-notes`. + +If it's still not resolved the build issue, here are a few more ideas. + +``fairscale`` seems to have an issue with the recently introduced by pip build isolation feature. If you have a problem +with it, you may want to try one of: + +.. code-block:: bash + + pip install fairscale --no-build-isolation . + +or: + +.. 
code-block:: bash + + git clone https://github.com/facebookresearch/fairscale/ + cd fairscale + rm -r dist build + python setup.py bdist_wheel + pip uninstall -y fairscale + pip install dist/fairscale-*.whl + +``fairscale`` also has issues with building against pytorch-nightly, so if you use it you may have to try one of: + +.. code-block:: bash - pip install fairscale + pip uninstall -y fairscale; pip install fairscale --pre \ + -f https://download.pytorch.org/whl/nightly/cu110/torch_nightly.html \ + --no-cache --no-build-isolation - or find more details on `the FairScale's GitHub page - `__. +or: + +.. code-block:: bash + + pip install -v --disable-pip-version-check . \ + -f https://download.pytorch.org/whl/nightly/cu110/torch_nightly.html --pre + +Of course, adjust the urls to match the cuda version you use. + +If after trying everything suggested you still encounter build issues, please, proceed with the GitHub Issue of +`FairScale `__. + + + +**Usage**: -2. To use the first version of Sharded data-parallelism, add ``--sharded_ddp simple`` to the command line arguments, - and make sure you have added the distributed launcher ``-m torch.distributed.launch - --nproc_per_node=NUMBER_OF_GPUS_YOU_HAVE`` if you haven't been using it already. +To use the first version of Sharded data-parallelism, add ``--sharded_ddp simple`` to the command line arguments, and +make sure you have added the distributed launcher ``-m torch.distributed.launch +--nproc_per_node=NUMBER_OF_GPUS_YOU_HAVE`` if you haven't been using it already. For example here is how you could use it for ``run_translation.py`` with 2 GPUs: @@ -303,9 +355,9 @@ Notes: able to use significantly larger batch sizes using the same hardware (e.g. 3x and even bigger) which should lead to significantly shorter training time. -3. To use the second version of Sharded data-parallelism, add ``--sharded_ddp zero_dp_2`` or ``--sharded_ddp zero_dp_3` - to the command line arguments, and make sure you have added the distributed launcher ``-m torch.distributed.launch - --nproc_per_node=NUMBER_OF_GPUS_YOU_HAVE`` if you haven't been using it already. +3. To use the second version of Sharded data-parallelism, add ``--sharded_ddp zero_dp_2`` or ``--sharded_ddp + zero_dp_3`` to the command line arguments, and make sure you have added the distributed launcher ``-m + torch.distributed.launch --nproc_per_node=NUMBER_OF_GPUS_YOU_HAVE`` if you haven't been using it already. For example here is how you could use it for ``run_translation.py`` with 2 GPUs: @@ -346,19 +398,23 @@ DeepSpeed ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ `DeepSpeed `__ implements everything described in the `ZeRO paper -`__, except ZeRO's stage 3. "Parameter Partitioning (Pos+g+p)". Currently it provides -full support for: +`__. Currently it provides full support for: 1. Optimizer State Partitioning (ZeRO stage 1) -2. Add Gradient Partitioning (ZeRO stage 2) -3. Custom fp16 handling -4. A range of fast Cuda-extension-based Optimizers -5. ZeRO-Offload +2. Gradient Partitioning (ZeRO stage 2) +3. Param Partitioning (ZeRO stage 3) +4. Custom mixed precision training handling +5. A range of fast CUDA-extension-based Optimizers +6. ZeRO-Offload ZeRO-Offload has its own dedicated paper: `ZeRO-Offload: Democratizing Billion-Scale Model Training `__. -DeepSpeed is currently used only for training, as all the currently available features are of no use to inference. 
+DeepSpeed ZeRO-2 is currently used only for training, as all the currently available features are of no use to +inference. + +DeepSpeed ZeRO-3 can be used for inference as well, since it allows huge models to be loaded on multiple GPUs, which +won't be possible on a single GPU. @@ -371,7 +427,82 @@ Install the library via pypi: pip install deepspeed -or find more details on `the DeepSpeed's GitHub page `__. +or via ``transformers``' ``extras``: + +.. code-block:: bash + + pip install transformers[deepspeed] + +(will become available starting from ``transformers==4.6.0``) + +or find more details on `the DeepSpeed's GitHub page `__ and +`advanced install `__. + +If you're still struggling with the build, first make sure to read :ref:`zero-install-notes`. + +If you don't prebuild the extensions and rely on them to be built at run time and you tried all of the above solutions +to no avail, the next thing to try is to pre-build the modules before installing them. + +To make a local build for DeepSpeed: + +.. code-block:: bash + + git clone https://github.com/microsoft/DeepSpeed/ + cd DeepSpeed + rm -rf build + TORCH_CUDA_ARCH_LIST="6.1;8.6" DS_BUILD_OPS=1 pip install . \ + --global-option="build_ext" --global-option="-j8" --no-cache -v \ + --disable-pip-version-check 2>&1 | tee build.log + +Edit ``TORCH_CUDA_ARCH_LIST`` to insert the code for the architectures of the GPU cards you intend to use. + +Or if you need to use the same setup on multiple machines, make a binary wheel: + +.. code-block:: bash + + git clone https://github.com/microsoft/DeepSpeed/ + cd DeepSpeed + rm -rf build + TORCH_CUDA_ARCH_LIST="6.1;8.6" DS_BUILD_OPS=1 \ + python setup.py build_ext -j8 bdist_wheel + +it will generate something like ``dist/deepspeed-0.3.13+8cd046f-cp38-cp38-linux_x86_64.whl`` which now you can install +as ``pip install deepspeed-0.3.13+8cd046f-cp38-cp38-linux_x86_64.whl`` locally or on any other machine. + +Again, remember to ensure to adjust ``TORCH_CUDA_ARCH_LIST`` to the target architectures. + +You can find the complete list of NVIDIA GPUs and their corresponding **Compute Capabilities** (same as arch in this +context) `here `__. + +You can check the archs pytorch was built with using: + +.. code-block:: bash + + python -c "import torch; print(torch.cuda.get_arch_list())" + +Here is how to find out the arch for one of the installed GPU. For example, for GPU 0: + +.. code-block:: bash + + CUDA_VISIBLE_DEVICES=0 python -c "import torch; \ + print(torch.cuda.get_device_properties(torch.device('cuda')))" + +If the output is: + +.. code-block:: bash + + _CudaDeviceProperties(name='GeForce RTX 3090', major=8, minor=6, total_memory=24268MB, multi_processor_count=82) + +then you know that this card's arch is ``8.6``. + +You can also leave ``TORCH_CUDA_ARCH_LIST`` out completely and then the build program will automatically query the +architecture of the GPUs the build is made on. This may or may not match the GPUs on the target machines, that's why +it's best to specify the desired archs explicitly. + +If after trying everything suggested you still encounter build issues, please, proceed with the GitHub Issue of +`Deepspeed `__, + + Deployment with multiple GPUs ======================================================================================================================= @@ -410,7 +541,7 @@ Here is an example of running ``run_translation.py`` under DeepSpeed deploying a .. 
code-block:: bash deepspeed examples/seq2seq/run_translation.py \ - --deepspeed examples/tests/deepspeed/ds_config.json \ + --deepspeed tests/deepspeed/ds_config.json \ --model_name_or_path t5-small --per_device_train_batch_size 1 \ --output_dir output_dir --overwrite_output_dir --fp16 \ --do_train --max_train_samples 500 --num_train_epochs 1 \ @@ -435,7 +566,7 @@ To deploy DeepSpeed with one GPU adjust the :class:`~transformers.Trainer` comma .. code-block:: bash deepspeed --num_gpus=1 examples/seq2seq/run_translation.py \ - --deepspeed examples/tests/deepspeed/ds_config.json \ + --deepspeed tests/deepspeed/ds_config.json \ --model_name_or_path t5-small --per_device_train_batch_size 1 \ --output_dir output_dir --overwrite_output_dir --fp16 \ --do_train --max_train_samples 500 --num_train_epochs 1 \ @@ -498,7 +629,7 @@ Deployment in Notebooks The problem with running notebook cells as a script is that there is no normal ``deepspeed`` launcher to rely on, so under certain setups we have to emulate it. -Here is how you'd have to adjust your training code in the notebook to use DeepSpeed. +If you're using only 1 GPU, here is how you'd have to adjust your training code in the notebook to use DeepSpeed. .. code-block:: python @@ -516,7 +647,11 @@ Here is how you'd have to adjust your training code in the notebook to use DeepS trainer = Trainer(...) trainer.train() -Note: `...` stands for the normal arguments that you'd pass to the functions. +Note: ``...`` stands for the normal arguments that you'd pass to the functions. + +If you want to use more than 1 GPU, you must use a multi-process environment for DeepSpeed to work. That is, you have +to use the launcher for that purpose and this cannot be accomplished by emulating the distributed environment presented +at the beginning of this section. If you want to create the config file on the fly in the notebook in the current directory, you could have a dedicated cell with: @@ -570,22 +705,30 @@ cell with: EOT -That's said if the script is not in the notebook cells, you can launch ``deepspeed`` normally via shell from a cell -with: +If the training script is in a normal file and not in the notebook cells, you can launch ``deepspeed`` normally via +shell from a cell. For example, to use ``run_translation.py`` you would launch it with: .. code-block:: - !deepspeed examples/seq2seq/run_translation.py ... + !git clone https://github.com/huggingface/transformers + !cd transformers; deepspeed examples/seq2seq/run_translation.py ... -or with bash magic, where you can write a multi-line code for the shell to run: +or with ``%%bash`` magic, where you can write a multi-line code for the shell program to run: .. code-block:: %%bash - cd /somewhere + git clone https://github.com/huggingface/transformers + cd transformers deepspeed examples/seq2seq/run_translation.py ... +In such case you don't need any of the code presented at the beginning of this section. + +Note: ``%%bash`` magic is neat, but currently it buffers the output so you won't see the logs until the process +completes. + + @@ -717,26 +860,45 @@ Of course, you will need to adjust the values in this example to your situation. ZeRO ======================================================================================================================= +`Zero Redundancy Optimizer (ZeRO) `__ is the work horse of DeepSpeed. It +support 3 different levels (stages) of optimization. The first one is not quite interesting for scalability purposes, +therefore this document focuses on stages 2 and 3. 
You will find more indepth information in the DeepSpeed +documentation. + The ``zero_optimization`` section of the configuration file is the most important part (`docs `__), since that is where you define -which ZeRO stages you want to enable and how to configure them. +which ZeRO stages you want to enable and how to configure them. You will find the explanation for each parameter in the +DeepSpeed docs. + +This section has to be configured exclusively via DeepSpeed configuration - the :class:`~transformers.Trainer` provides +no equivalent command line arguments. + +Note: currently DeepSpeed doesn't validate parameter names, so if you misspell any, it'll use the default setting for +the parameter that got misspelled. You can watch the DeepSpeed engine start up log messages to see what values it is +going to use. + + +ZeRO-2 Config ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +The following is an example configuration for ZeRO stage 2: .. code-block:: json { - "zero_optimization": { - "stage": 2, - "allgather_partitions": true, - "allgather_bucket_size": 5e8, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 5e8, - "contiguous_gradients": true, - "cpu_offload": true - } + "zero_optimization": { + "stage": 2, + "allgather_partitions": true, + "allgather_bucket_size": 5e8, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 5e8, + "contiguous_gradients": true, + "cpu_offload": true + } } -Notes: +**Performance tuning:** - enabling ``cpu_offload`` should reduce GPU RAM usage (it requires ``"stage": 2``) - ``"overlap_comm": true`` trades off increased GPU RAM usage to lower all-reduce latency. ``overlap_comm`` uses 4.5x @@ -748,10 +910,218 @@ Notes: the slower the communication, and the more GPU RAM will be available to other tasks. So if a bigger batch size is important, getting a slightly slower training time could be a good trade. -This section has to be configured exclusively via DeepSpeed configuration - the :class:`~transformers.Trainer` provides -no equivalent command line arguments. + +ZeRO-3 Config ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +The following is an example configuration for ZeRO stage 3: +.. code-block:: json + + { + "zero_optimization": { + "stage": 3, + "cpu_offload": true, + "cpu_offload_params": true, + "cpu_offload_use_pin_memory" : true, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e14, + "reduce_bucket_size": 1e6, + "stage3_prefetch_bucket_size": 0.94e6, + "stage3_param_persistence_threshold": 1e4, + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_fp16_weights_on_model_save": true + } + } + +Note: if you're migrating from ZeRO-2 configuration that: ``allgather_partitions``, ``allgather_bucket_size`` and +``reduce_scatter`` configuration parameters are not used in ZeRO-3. If you keep these they will just be ignored. + +**Performance tuning:** + +- ``sub_group_size``: ``1e14`` +- ``reduce_bucket_size``: ``hidden_size*hidden_size`` +- ``stage3_prefetch_bucket_size``: ``0.9 * hidden_size * hidden_size`` +- ``stage3_param_persistence_threshold``: ``10 * hidden_size`` +- ``stage3_max_live_parameters``: ``1e9`` +- ``stage3_max_reuse_distance``: ``1e9`` + +If hitting OOM reduce ``stage3_max_live_parameters`` and ``stage3_max_reuse_distance``. 
They should have minimal impact +on performance unless you are doing activation checkpointing. ``1e9`` would consume ~2GB. The memory is shared by +``stage3_max_live_parameters`` and ``stage3_max_reuse_distance``, so its not additive, its just 2GB total. + +``stage3_max_live_parameters`` is the upper limit on how many full parameters you want to keep on the GPU at any given +time. "reuse distance" is a metric we are using to figure out when will a parameter be used again in the future, and we +use the ``stage3_max_reuse_distance`` to decide whether to throw away the parameter or to keep it. If a parameter is +going to be used again in near future (less than ``stage3_max_reuse_distance``) then we keep it to reduce communication +overhead. This is super helpful when you have activation checkpointing enabled, where we do a forward recompute and +backward passes a a single layer granularity and want to keep the parameter in the forward recompute till the backward + +If you set ``reduce_bucket_size``, ``stage3_prefetch_bucket_size`` and ``stage3_param_persistence_threshold`` as +recommended above, they will already be fairly small so you won't have to tune those much. + +Since ``hidden_size`` varies from model to model, the ``Trainer`` will automatically set the needed value for the 3 +config parameters that contain that variable (using ``model.config.hidden_size``). Just set these values to ``0`` as +shown below and the right configuration will be passed to DeepSpeed: + +.. code-block:: json + + { + "zero_optimization": { + "stage": 3, + "cpu_offload": true, + "cpu_offload_params": true, + "cpu_offload_use_pin_memory" : true, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e14, + "reduce_bucket_size": 0, + "stage3_prefetch_bucket_size": 0, + "stage3_param_persistence_threshold": 0, + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_fp16_weights_on_model_save": true + } + } + +``stage3_gather_fp16_weights_on_model_save`` enables model fp16 weights consolidation when model gets saved. With large +models and multiple GPUs this is an expensive operation both in terms of memory and speed. It's currently required if +you plan to resume the training. Watch out for future updates that will remove this limitation and make things more +flexible. + + +ZeRO-2 vs ZeRO-3 Performance ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +ZeRO-3 is likely to be slower than ZeRO-2 if everything else is configured the same because the former has to gather +model weights in addition to what ZeRO-2 does. If ZeRO-2 meets your needs and you don't need to scale beyond a few GPUs +then you may choose to stick to it. It's important to understand that ZeRO-3 enables a much higher scalability capacity +at a cost of speed. + +It's possible to adjust ZeRO-3 configuration to make it perform closer to ZeRO-2: + +- set ``stage3_param_persistence_threshold`` to a very large number - larger than the largest parameter, e.g., ``6 * + hidden_size * hidden_size``. This will keep the parameters on the GPUs. +- turn off ``cpu_offload_params`` since ZeRO-2 doesn't have that option. + +The performance will likely improve significantly with just ``cpu_offload_params`` turned off, even if you don't change +``stage3_param_persistence_threshold``. Of course, these changes will impact the size of the model you can train. So +these help you to trade scalability for speed depending on your needs. 
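+Concretely, assuming a hypothetical model with a ``hidden_size`` of 1024, the two adjustments above could look like the
+following minimal sketch, which shows only the affected ``zero_optimization`` entries
+(``6 * 1024 * 1024 = 6291456``); all other parameters stay as in the full ZeRO-3 example below:
+
+.. code-block:: json
+
+    {
+        "zero_optimization": {
+            "stage": 3,
+            "cpu_offload_params": false,
+            "stage3_param_persistence_threshold": 6291456
+        }
+    }
+
+The ``hidden_size`` value here is purely illustrative - substitute the one from your model's config.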
+ + + +ZeRO-2 Example ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +Here is a full ZeRO-2 all-enabled configuration file ``ds_config_zero2.json``: + +.. code-block:: json + + { + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + "zero_optimization": { + "stage": 2, + "allgather_partitions": true, + "allgather_bucket_size": 2e8, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 2e8, + "contiguous_gradients": true, + "cpu_offload": true + }, + + "optimizer": { + "type": "AdamW", + "params": { + "lr": 3e-5, + "betas": [0.8, 0.999], + "eps": 1e-8, + "weight_decay": 3e-7 + } + }, + + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0, + "warmup_max_lr": 3e-5, + "warmup_num_steps": 500 + } + }, + + "steps_per_print": 2000, + "wall_clock_breakdown": false + } + + + +ZeRO-3 Example ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +Here is a full ZeRO-3 all-enabled configuration file ``ds_config_zero3.json``: + +.. code-block:: json + + { + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + "zero_optimization": { + "stage": 3, + "cpu_offload": true, + "cpu_offload_params": true, + "cpu_offload_use_pin_memory" : true, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e14, + "reduce_bucket_size": 1e6, + "stage3_prefetch_bucket_size": 0.94e6, + "stage3_param_persistence_threshold": 1e4, + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_fp16_weights_on_model_save": true + }, + + "optimizer": { + "type": "AdamW", + "params": { + "lr": 3e-5, + "betas": [0.8, 0.999], + "eps": 1e-8, + "weight_decay": 3e-7 + } + }, + + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0, + "warmup_max_lr": 3e-5, + "warmup_num_steps": 500 + } + }, + + "steps_per_print": 2000, + "wall_clock_breakdown": false + } + Optimizer and Scheduler ======================================================================================================================= @@ -772,7 +1142,7 @@ If ``cpu_offload`` is enabled you must use both DeepSpeed scheduler and DeepSpee Optimizer -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ DeepSpeed's main optimizers are Adam, AdamW, OneBitAdam, and Lamb. These have been thoroughly tested with ZeRO and are @@ -818,7 +1188,7 @@ make sure to adjust the values. e.g. if use Adam you will want ``weight_decay`` Scheduler -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ DeepSpeed supports LRRangeTest, OneCycle, WarmupLR and WarmupDecayLR LR schedulers. The full documentation is `here `__. 
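For instance, a ``WarmupDecayLR`` entry could look like the following sketch (the values shown are illustrative
placeholders and would need to match your own training run):

.. code-block:: json

    {
        "scheduler": {
            "type": "WarmupDecayLR",
            "params": {
                "total_num_steps": 10000,
                "warmup_min_lr": 0,
                "warmup_max_lr": 3e-5,
                "warmup_num_steps": 500
            }
        }
    }
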
@@ -886,11 +1256,7 @@ and ``warmup_max_lr``, ``warmup_num_steps`` and ``total_num_steps`` will be corr Automatic Mixed Precision ======================================================================================================================= -You can work with FP16 in one of the following ways: - -1. Pytorch native amp, as documented `here `__. -2. NVIDIA's apex, as documented `here - `__. +You can use automatic mixed precision with either a pytorch-like AMP way or the apex-like way: If you want to use an equivalent of the Pytorch native amp, you can either configure the ``fp16`` entry in the configuration file, or use the following command line arguments: ``--fp16 --fp16_backend amp``. @@ -909,6 +1275,8 @@ Here is an example of the ``fp16`` configuration: }, } +Here is the `documentation `__. + If you want to use NVIDIA's apex instead, you can can either configure the ``amp`` entry in the configuration file, or use the following command line arguments: ``--fp16 --fp16_backend apex --fp16_opt_level 01``. @@ -923,6 +1291,9 @@ Here is an example of the ``amp`` configuration: } } +Here is the `documentation +`__. + Gradient Accumulation ======================================================================================================================= @@ -935,12 +1306,12 @@ While normally DeepSpeed gets gradient accumulation configured with: "gradient_accumulation_steps": 3, } -in this case, to enable gradient accumulation, pass the command line `--gradient_accumulation_steps` argument as normal -and it will get injected into the DeepSpeed configuration. +in this case, to enable gradient accumulation, pass the command line ``--gradient_accumulation_steps 3`` argument as +normal and it will get injected into the DeepSpeed configuration. -If you try to add it directly to the configuration file, you will receive an error from the Trainer - this is because -this setting is needed by the Trainer too, and so this approach ensures that there is a single way of setting this -value and thus avoid potential subtle errors. +If you try to add it directly to the configuration file, you will receive an error from the ``Trainer`` - this is +because this setting is needed by the ``Trainer`` too, and so this approach ensures that there is a single way of +setting this value and thus avoid potential subtle errors. @@ -963,6 +1334,175 @@ Here is an example of the ``gradient_clipping`` configuration: +Getting the model weights out +======================================================================================================================= + +As long as you continue training and resuming using DeepSpeed you don't need to worry about anything. DeepSpeed stores +fp32 master weights in its custom checkpoint optimizer files, which are ``global_step*/*optim_states.pt`` (this is glob +pattern), and are saved under the normal checkpoint. + +**FP16 Weights:** + +When a model is saved under ZeRO-2, you end up having the normal ``pytorch_model.bin`` file with the model weights, but +they are only the fp16 version of the weights. + +Under ZeRO-3, things are much more complicated, since the model weights are partitioned out over multiple GPUs, +therefore ``"stage3_gather_fp16_weights_on_model_save": true`` is required to get the ``Trainer`` to save the fp16 +version of the weights. If this setting is ``False`` ``pytorch_model.bin`` won't be created. This is because by default +DeepSpeed's ``state_dict`` contains a placeholder and not the real weights. 
If we were to save this ``state_dict`` it +won't be possible to load it back. + +**FP32 Weights:** + +While the fp16 weights are fine for resuming training, if you finished finetuning your model and want to upload it to +the `models hub `__ or pass it to someone else you most likely will want to get the fp32 +weights. This cannot be done during training since this is a process that requires a lot of memory, and therefore this +is performed offline. + +DeepSpeed creates a special conversion script ``zero_to_fp32.py`` which it places in the top-level of the checkpoint +folder. Using this script you can extract the weights at any point. The script is standalone and you no longer need to +have the configuration file or a ``Trainer`` to do the extraction. + +Let's say your checkpoint folder looks like this: + +.. code-block:: bash + + $ ls -l output_dir/checkpoint-1/ + -rw-rw-r-- 1 stas stas 1.4K Mar 27 20:42 config.json + drwxrwxr-x 2 stas stas 4.0K Mar 25 19:52 global_step1/ + -rw-rw-r-- 1 stas stas 12 Mar 27 13:16 latest + -rw-rw-r-- 1 stas stas 827K Mar 27 20:42 optimizer.pt + -rw-rw-r-- 1 stas stas 231M Mar 27 20:42 pytorch_model.bin + -rw-rw-r-- 1 stas stas 623 Mar 27 20:42 scheduler.pt + -rw-rw-r-- 1 stas stas 1.8K Mar 27 20:42 special_tokens_map.json + -rw-rw-r-- 1 stas stas 774K Mar 27 20:42 spiece.model + -rw-rw-r-- 1 stas stas 1.9K Mar 27 20:42 tokenizer_config.json + -rw-rw-r-- 1 stas stas 339 Mar 27 20:42 trainer_state.json + -rw-rw-r-- 1 stas stas 2.3K Mar 27 20:42 training_args.bin + -rwxrw-r-- 1 stas stas 5.5K Mar 27 13:16 zero_to_fp32.py* + +In this example there is just one DeepSpeed checkpoint sub-folder `global_step1`. Therefore to reconstruct the fp32 +weights just run: + +.. code-block:: bash + + python zero_to_fp32.py global_step1 pytorch_model.bin + +The script will automatically handle either ZeRO-2 or ZeRO-3 checkpoint. + +``python zero_to_fp32.py -h`` will give you usage details. + +If you have multiple DeepSpeed checkpoint sub-folders, pick the one you know to have the desired weights. + +This is it. ``pytorch_model.bin`` will now contain the full fp32 model weights consolidated from multiple GPUs. + +Note: currently the script requires 2x general RAM of the final fp32 model weights. + +ZeRO 3 Nuances +======================================================================================================================= + +ZeRO 3 is quite different from ZeRO 2 because of its param sharding feature. + +While all the efforts were made for things to just work without needing any special changes to your models, in certain +circumstances you may find the following information to be needed. + + +Registering External Parameters ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +If layer A needs to access weights belonging to layer B, currently layer A needs to tell DeepSpeed about it. This is +done with the help of ``deepspeed.zero.register_external_parameter`` that needs to be called in ``A.__init__`` and can +be seen in the following example: + +.. 
code-block:: python + + class ModuleZ3(torch.nn.Module): + def __init__(self, *args): + super().__init__(self, *args) + self.layer1 = SomeLayer() + self.layer2 = OtherLayer() + deepspeed.zero.register_external_parameter(self, self.layer1.weight) + + def forward(self, input): + x = self.layer1(input) + # self.layer1.weight is needed in ModuleZ3.forward + y = self.layer2(x, self.layer1.weight) + return y + +In general ``transformers`` models don't use this style of referring to other layer's weights so most likely you won't +need to use it. + +For full details on this method please refer to `Registering External Parameters +`__. + + + +Constructing Massive Models ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +DeepSpeed/ZeRO-3 can handle models with Trillions of parameters which may not fit onto the existing RAM. In such cases, +but also if you want the initialization to happen much faster, initialize the model using `deepspeed.zero.Init()` +context manager (which is also a function decorator), like so: + +.. code-block:: python + + from transformers import T5ForConditionalGeneration, T5Config + import deepspeed + with deepspeed.zero.Init(): + config = T5Config.from_pretrained("t5-small") + model = T5ForConditionalGeneration(config) + +As you can see this gives you a randomly initialized model. + +If you want to use a pretrained model, ``model_class.from_pretrained`` will activate this feature as long as +``is_deepspeed_zero3_enabled()`` returns ``True``, which can be set manually via ``deepspeed_zero3_enable(True)``. +Therefore to enable this feature here is the required sequence: + +.. code-block:: python + + from transformers.integrations import deepspeed_zero3_enable + deepspeed_zero3_enable(True) + model = T5ForConditionalGeneration.from_pretrained("t5-small") + +If you're using ``Trainer`` command line arguments which include ``--deepspeed ds_config.json`` with ZeRO-3 config +enabled, then you can skip ``deepspeed_zero3_enable(True)`` as it will try to discover whether it'll be run under +ZeRO-3 and ``from_pretrained`` will automatically activate this feature. + +Note: If the fp16 weights of the model can't fit onto the memory of a single GPU this feature must be used. + +For full details on this method and other related features please refer to `Constructing Massive Models +`__. + + + + + +Gathering Parameters ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +Under ZeRO-3 on multiple GPUs no single GPU has all the parameters unless it's the parameters for the currently +executing layer. So if you need to access all parameters from all layers at once there is a specific method to do it. +Most likely you won't need it, but if you do please refer to `Gathering Parameters +`__ + +We do however use it internally in several places, one such example is when loading pretrained model weights in +``from_pretrained``. We load one layer at a time and immediately partition it to all participating GPUs, as for very +large models it won't be possible to load it on one GPU and then spread it out to multiple GPUs, due to memory +limitations. + +Also under ZeRO-3, if you write your own code and run into a model parameter weight that looks like: + +.. 
code-block:: python + + tensor([1.], device='cuda:0', dtype=torch.float16, requires_grad=True) + +stress on ``tensor([1.])``, or if you get an error where it says the parameter is of size ``1``, instead of some much +larger multi-dimensional shape, this means that the parameter is partitioned and what you see is a ZeRO-3 placeholder. + + + + + Notes ======================================================================================================================= diff --git a/docs/source/model_doc/auto.rst b/docs/source/model_doc/auto.rst index 46473010862466..e0e76c77958dd4 100644 --- a/docs/source/model_doc/auto.rst +++ b/docs/source/model_doc/auto.rst @@ -44,6 +44,13 @@ AutoTokenizer :members: +AutoFeatureExtractor +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.AutoFeatureExtractor + :members: + + AutoModel ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/model_doc/bert.rst b/docs/source/model_doc/bert.rst index 881060df1883ec..658006f5434a02 100644 --- a/docs/source/model_doc/bert.rst +++ b/docs/source/model_doc/bert.rst @@ -90,7 +90,7 @@ BertForPreTraining :members: forward -BertModelLMHeadModel +BertLMHeadModel ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.BertLMHeadModel diff --git a/docs/source/model_doc/convbert.rst b/docs/source/model_doc/convbert.rst index 80ed9ebc37b677..69f74733549b0c 100644 --- a/docs/source/model_doc/convbert.rst +++ b/docs/source/model_doc/convbert.rst @@ -56,8 +56,7 @@ ConvBertTokenizerFast ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.ConvBertTokenizerFast - :members: build_inputs_with_special_tokens, get_special_tokens_mask, - create_token_type_ids_from_sequences, save_vocabulary + :members: ConvBertModel diff --git a/docs/source/model_doc/cpm.rst b/docs/source/model_doc/cpm.rst new file mode 100644 index 00000000000000..e1380f4a933d4b --- /dev/null +++ b/docs/source/model_doc/cpm.rst @@ -0,0 +1,44 @@ +.. + Copyright 2020 The HuggingFace Team. All rights reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + specific language governing permissions and limitations under the License. + +CPM +----------------------------------------------------------------------------------------------------------------------- + +Overview +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The CPM model was proposed in `CPM: A Large-scale Generative Chinese Pre-trained Language Model +`__ by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, +Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, +Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. 
+ +The abstract from the paper is the following: + +*Pre-trained Language Models (PLMs) have proven to be beneficial for various downstream NLP tasks. Recently, GPT-3, +with 175 billion parameters and 570GB training data, drew a lot of attention due to the capacity of few-shot (even +zero-shot) learning. However, applying GPT-3 to address Chinese NLP tasks is still challenging, as the training corpus +of GPT-3 is primarily English, and the parameters are not publicly available. In this technical report, we release the +Chinese Pre-trained Language Model (CPM) with generative pre-training on large-scale Chinese training data. To the best +of our knowledge, CPM, with 2.6 billion parameters and 100GB Chinese training data, is the largest Chinese pre-trained +language model, which could facilitate several downstream Chinese NLP tasks, such as conversation, essay generation, +cloze test, and language understanding. Extensive experiments demonstrate that CPM achieves strong performance on many +NLP tasks in the settings of few-shot (even zero-shot) learning.* + +The original implementation can be found here: https://github.com/TsinghuaAI/CPM-Generate + +Note: We only have a tokenizer here, since the model architecture is the same as GPT-2. + +CpmTokenizer +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.CpmTokenizer + :members: diff --git a/docs/source/model_doc/gpt_neo.rst b/docs/source/model_doc/gpt_neo.rst index 652c613a34e530..3a164ee87928ce 100644 --- a/docs/source/model_doc/gpt_neo.rst +++ b/docs/source/model_doc/gpt_neo.rst @@ -38,9 +38,9 @@ The :obj:`generate()` method can be used to generate text using GPT Neo model. ... "previously unexplored valley, in the Andes Mountains. Even more surprising to the " \ ... "researchers was the fact that the unicorns spoke perfect English." - >>> input_ids = tokenizer(unicorns, return_tensors="pt").input_ids + >>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids - >>> gen_tokens = model.generate(ids, do_sample=True, temperature=0.9, max_length=100,) + >>> gen_tokens = model.generate(input_ids, do_sample=True, temperature=0.9, max_length=100,) >>> gen_text = tokenizer.batch_decode(gen_tokens)[0] diff --git a/docs/source/model_doc/led.rst b/docs/source/model_doc/led.rst index 4dbdbbaeb3df39..83a9386165577b 100644 --- a/docs/source/model_doc/led.rst +++ b/docs/source/model_doc/led.rst @@ -73,8 +73,7 @@ LEDTokenizerFast ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.LEDTokenizerFast - :members: build_inputs_with_special_tokens, get_special_tokens_mask, - create_token_type_ids_from_sequences, save_vocabulary + :members: LED specific outputs diff --git a/docs/source/model_doc/megatron_bert.rst b/docs/source/model_doc/megatron_bert.rst new file mode 100644 index 00000000000000..853f09b9b42042 --- /dev/null +++ b/docs/source/model_doc/megatron_bert.rst @@ -0,0 +1,153 @@ +.. + Copyright 2021 NVIDIA Corporation and The HuggingFace Team. All rights reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + specific language governing permissions and limitations under the License. + +MegatronBERT +----------------------------------------------------------------------------------------------------------------------- + +Overview +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The MegatronBERT model was proposed in `Megatron-LM: Training Multi-Billion Parameter Language Models Using Model +Parallelism `__ by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, +Jared Casper and Bryan Catanzaro. + +The abstract from the paper is the following: + +*Recent work in language modeling demonstrates that training large transformer models advances the state of the art in +Natural Language Processing applications. However, very large models can be quite difficult to train due to memory +constraints. In this work, we present our techniques for training very large transformer models and implement a simple, +efficient intra-layer model parallel approach that enables training transformer models with billions of parameters. Our +approach does not require a new compiler or library changes, is orthogonal and complimentary to pipeline model +parallelism, and can be fully implemented with the insertion of a few communication operations in native PyTorch. We +illustrate this approach by converging transformer based models up to 8.3 billion parameters using 512 GPUs. We sustain +15.1 PetaFLOPs across the entire application with 76% scaling efficiency when compared to a strong single GPU baseline +that sustains 39 TeraFLOPs, which is 30% of peak FLOPs. To demonstrate that large language models can further advance +the state of the art (SOTA), we train an 8.3 billion parameter transformer language model similar to GPT-2 and a 3.9 +billion parameter model similar to BERT. We show that careful attention to the placement of layer normalization in +BERT-like models is critical to achieving increased performance as the model size grows. Using the GPT-2 model we +achieve SOTA results on the WikiText103 (10.8 compared to SOTA perplexity of 15.8) and LAMBADA (66.5% compared to SOTA +accuracy of 63.2%) datasets. Our BERT model achieves SOTA results on the RACE dataset (90.9% compared to SOTA accuracy +of 89.4%).* + +Tips: + +We have provided pretrained `BERT-345M `__ checkpoints +for use to evaluate or finetuning downstream tasks. + +To access these checkpoints, first `sign up `__ for and setup the NVIDIA GPU Cloud (NGC) +Registry CLI. Further documentation for downloading models can be found in the `NGC documentation +`__. + +Alternatively, you can directly download the checkpoints using: + +BERT-345M-uncased:: + +.. code-block:: bash + + wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_uncased/zip + -O megatron_bert_345m_v0_1_uncased.zip + +BERT-345M-cased:: + +.. 
code-block:: bash + + wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_cased/zip -O + megatron_bert_345m_v0_1_cased.zip + +Once you have obtained the checkpoints from NVIDIA GPU Cloud (NGC), you have to convert them to a format that will +easily be loaded by Hugging Face Transformers and our port of the BERT code. + +The following commands allow you to do the conversion. We assume that the folder ``models/megatron_bert`` contains +``megatron_bert_345m_v0_1_{cased, uncased}.zip`` and that the commands are run from inside that folder:: + +.. code-block:: bash + + python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py megatron_bert_345m_v0_1_uncased.zip + +.. code-block:: bash + + python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py megatron_bert_345m_v0_1_cased.zip + +The original code can be found `here `__. That repository contains a multi-GPU +and multi-node implementation of the Megatron Language models. In particular, it contains a hybrid model parallel +approach using "tensor parallel" and "pipeline parallel" techniques. + +MegatronBertConfig +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.MegatronBertConfig + :members: + + +MegatronBertModel +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.MegatronBertModel + :members: forward + + +MegatronBertForMaskedLM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.MegatronBertForMaskedLM + :members: forward + + +MegatronBertForCausalLM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.MegatronBertForCausalLM + :members: forward + + +MegatronBertForNextSentencePrediction +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.MegatronBertForNextSentencePrediction + :members: forward + + +MegatronBertForPreTraining +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.MegatronBertForPreTraining + :members: forward + + +MegatronBertForSequenceClassification +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.MegatronBertForSequenceClassification + :members: forward + + +MegatronBertForMultipleChoice +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.MegatronBertForMultipleChoice + :members: forward + + +MegatronBertForTokenClassification +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.MegatronBertForTokenClassification + :members: forward + + +MegatronBertForQuestionAnswering +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
autoclass:: transformers.MegatronBertForQuestionAnswering + :members: forward + + diff --git a/docs/source/model_doc/megatron_gpt2.rst b/docs/source/model_doc/megatron_gpt2.rst new file mode 100644 index 00000000000000..8a7659acd7ab89 --- /dev/null +++ b/docs/source/model_doc/megatron_gpt2.rst @@ -0,0 +1,70 @@ +.. + Copyright 2021 NVIDIA Corporation and The HuggingFace Team. All rights reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + specific language governing permissions and limitations under the License. + +MegatronGPT2 +----------------------------------------------------------------------------------------------------------------------- + +Overview +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The MegatronGPT2 model was proposed in `Megatron-LM: Training Multi-Billion Parameter Language Models Using Model +Parallelism `__ by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, +Jared Casper and Bryan Catanzaro. + +The abstract from the paper is the following: + +*Recent work in language modeling demonstrates that training large transformer models advances the state of the art in +Natural Language Processing applications. However, very large models can be quite difficult to train due to memory +constraints. In this work, we present our techniques for training very large transformer models and implement a simple, +efficient intra-layer model parallel approach that enables training transformer models with billions of parameters. Our +approach does not require a new compiler or library changes, is orthogonal and complimentary to pipeline model +parallelism, and can be fully implemented with the insertion of a few communication operations in native PyTorch. We +illustrate this approach by converging transformer based models up to 8.3 billion parameters using 512 GPUs. We sustain +15.1 PetaFLOPs across the entire application with 76% scaling efficiency when compared to a strong single GPU baseline +that sustains 39 TeraFLOPs, which is 30% of peak FLOPs. To demonstrate that large language models can further advance +the state of the art (SOTA), we train an 8.3 billion parameter transformer language model similar to GPT-2 and a 3.9 +billion parameter model similar to BERT. We show that careful attention to the placement of layer normalization in +BERT-like models is critical to achieving increased performance as the model size grows. Using the GPT-2 model we +achieve SOTA results on the WikiText103 (10.8 compared to SOTA perplexity of 15.8) and LAMBADA (66.5% compared to SOTA +accuracy of 63.2%) datasets. Our BERT model achieves SOTA results on the RACE dataset (90.9% compared to SOTA accuracy +of 89.4%).* + +Tips: + +We have provided pretrained `GPT2-345M `__ checkpoints +for use to evaluate or finetuning downstream tasks. + +To access these checkpoints, first `sign up `__ for and setup the NVIDIA GPU Cloud (NGC) +Registry CLI. Further documentation for downloading models can be found in the `NGC documentation +`__. + +Alternatively, you can directly download the checkpoints using:: + +.. 
code-block:: bash + + wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_lm_345m/versions/v0.0/zip -O + megatron_gpt2_345m_v0_0.zip + +Once you have obtained the checkpoint from NVIDIA GPU Cloud (NGC), you have to convert it to a format that will easily +be loaded by Hugging Face Transformers GPT2 implementation. + +The following command allows you to do the conversion. We assume that the folder ``models/megatron_gpt2`` contains +``megatron_gpt2_345m_v0_0.zip`` and that the command is run from that folder:: + +.. code-block:: bash + + python3 $PATH_TO_TRANSFORMERS/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py megatron_gpt2_345m_v0_0.zip + +The original code can be found `here `__. That repository contains a multi-GPU +and multi-node implementation of the Megatron Language models. In particular, it contains a hybrid model parallel +approach using "tensor parallel" and "pipeline parallel" techniques. + diff --git a/docs/source/testing.rst b/docs/source/testing.rst index 10ad3e23111d65..9a4efb06fcb85f 100644 --- a/docs/source/testing.rst +++ b/docs/source/testing.rst @@ -1,4 +1,4 @@ -.. +.. Copyright 2020 The HuggingFace Team. All rights reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with @@ -388,7 +388,7 @@ For a single or a group of tests via ``pytest`` (after ``pip install pytest-pspe .. code-block:: bash - pytest --pspec tests/test_optimization.py + pytest --pspec tests/test_optimization.py @@ -672,7 +672,7 @@ and it will list: test_this2.py::test_floor[integer-1-1.0] test_this2.py::test_floor[negative--1.5--2.0] - test_this2.py::test_floor[large fraction-1.6-1] + test_this2.py::test_floor[large fraction-1.6-1] So now you can run just the specific test: @@ -795,6 +795,23 @@ leave any data in there. otherwise. +Temporary sys.path override +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If you need to temporary override ``sys.path`` to import from another test for example, you can use the +``ExtendSysPath`` context manager. Example: + + +.. code-block:: python + + import os + from transformers.testing_utils import ExtendSysPath + bindir = os.path.abspath(os.path.dirname(__file__)) + with ExtendSysPath(f"{bindir}/.."): + from test_trainer import TrainerIntegrationCommon # noqa + + + Skipping tests ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/examples/language-modeling/run_clm.py b/examples/language-modeling/run_clm.py index db595b645767ca..505f8f68c4ca83 100755 --- a/examples/language-modeling/run_clm.py +++ b/examples/language-modeling/run_clm.py @@ -43,12 +43,13 @@ default_data_collator, set_seed, ) +from transformers.testing_utils import CaptureLogger from transformers.trainer_utils import get_last_checkpoint, is_main_process from transformers.utils import check_min_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.5.0.dev0") +check_min_version("4.6.0.dev0") logger = logging.getLogger(__name__) @@ -136,8 +137,8 @@ class DataTrainingArguments: block_size: Optional[int] = field( default=None, metadata={ - "help": "Optional input sequence length after tokenization." - "The training dataset will be truncated in block of this size for training." + "help": "Optional input sequence length after tokenization. 
" + "The training dataset will be truncated in block of this size for training. " "Default to the model max input length for single sentence inputs (take into account special tokens)." }, ) @@ -316,8 +317,18 @@ def main(): column_names = datasets["validation"].column_names text_column_name = "text" if "text" in column_names else column_names[0] + # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function + tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") + def tokenize_function(examples): - return tokenizer(examples[text_column_name]) + with CaptureLogger(tok_logger) as cl: + output = tokenizer(examples[text_column_name]) + # clm input could be much much longer than block_size + if "Token indices sequence length is longer than the" in cl.out: + tok_logger.warning( + "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits before being passed to the model." + ) + return output tokenized_datasets = datasets.map( tokenize_function, @@ -330,14 +341,14 @@ def tokenize_function(examples): if data_args.block_size is None: block_size = tokenizer.model_max_length if block_size > 1024: - logger.warn( + logger.warning( f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " "Picking 1024 instead. You can change that default value by passing --block_size xxx." ) block_size = 1024 else: if data_args.block_size > tokenizer.model_max_length: - logger.warn( + logger.warning( f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." ) diff --git a/examples/language-modeling/run_clm_no_trainer.py b/examples/language-modeling/run_clm_no_trainer.py index 559501dd7589f6..70fabd31df19c7 100755 --- a/examples/language-modeling/run_clm_no_trainer.py +++ b/examples/language-modeling/run_clm_no_trainer.py @@ -305,14 +305,14 @@ def tokenize_function(examples): if args.block_size is None: block_size = tokenizer.model_max_length if block_size > 1024: - logger.warn( + logger.warning( f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " "Picking 1024 instead. You can change that default value by passing --block_size xxx." ) block_size = 1024 else: if args.block_size > tokenizer.model_max_length: - logger.warn( + logger.warning( f"The block_size passed ({args.block_size}) is larger than the maximum length for the model" f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." ) diff --git a/examples/language-modeling/run_mlm.py b/examples/language-modeling/run_mlm.py index 627618ff5d38d8..2934fb0c23e813 100755 --- a/examples/language-modeling/run_mlm.py +++ b/examples/language-modeling/run_mlm.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.5.0.dev0") +check_min_version("4.6.0.dev0") logger = logging.getLogger(__name__) MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_LM_MAPPING.keys()) @@ -324,14 +324,14 @@ def main(): if data_args.max_seq_length is None: max_seq_length = tokenizer.model_max_length if max_seq_length > 1024: - logger.warn( + logger.warning( f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx." 
) max_seq_length = 1024 else: if data_args.max_seq_length > tokenizer.model_max_length: - logger.warn( + logger.warning( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) @@ -422,7 +422,12 @@ def group_texts(examples): # Data collator # This one will take care of randomly masking the tokens. - data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=data_args.mlm_probability) + pad_to_multiple_of_8 = data_args.line_by_line and training_args.fp16 and not data_args.pad_to_max_length + data_collator = DataCollatorForLanguageModeling( + tokenizer=tokenizer, + mlm_probability=data_args.mlm_probability, + pad_to_multiple_of=8 if pad_to_multiple_of_8 else None, + ) # Initialize our Trainer trainer = Trainer( diff --git a/examples/language-modeling/run_mlm_no_trainer.py b/examples/language-modeling/run_mlm_no_trainer.py index 71a3bbe0c5a963..1cf1c242ab2150 100755 --- a/examples/language-modeling/run_mlm_no_trainer.py +++ b/examples/language-modeling/run_mlm_no_trainer.py @@ -308,14 +308,14 @@ def main(): if args.max_seq_length is None: max_seq_length = tokenizer.model_max_length if max_seq_length > 1024: - logger.warn( + logger.warning( f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx." ) max_seq_length = 1024 else: if args.max_seq_length > tokenizer.model_max_length: - logger.warn( + logger.warning( f"The max_seq_length passed ({args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) diff --git a/examples/language-modeling/run_plm.py b/examples/language-modeling/run_plm.py index 6048604c41cc1e..f5c9c47b72241b 100755 --- a/examples/language-modeling/run_plm.py +++ b/examples/language-modeling/run_plm.py @@ -44,7 +44,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.5.0.dev0") +check_min_version("4.6.0.dev0") logger = logging.getLogger(__name__) @@ -319,7 +319,7 @@ def main(): text_column_name = "text" if "text" in column_names else column_names[0] if data_args.max_seq_length > tokenizer.model_max_length: - logger.warn( + logger.warning( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." 
) diff --git a/examples/legacy/question-answering/run_squad.py b/examples/legacy/question-answering/run_squad.py index ff693ad24ddae0..84986eff6fec2f 100644 --- a/examples/legacy/question-answering/run_squad.py +++ b/examples/legacy/question-answering/run_squad.py @@ -436,7 +436,7 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal raise ImportError("If not data_dir is specified, tensorflow_datasets needs to be installed.") if args.version_2_with_negative: - logger.warn("tensorflow_datasets does not handle version 2 of SQuAD.") + logger.warning("tensorflow_datasets does not handle version 2 of SQuAD.") tfds_examples = tfds.load("squad") examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate) diff --git a/examples/legacy/seq2seq/README.md b/examples/legacy/seq2seq/README.md index 623b731d0d9e79..e4a8fff92b4c39 100644 --- a/examples/legacy/seq2seq/README.md +++ b/examples/legacy/seq2seq/README.md @@ -28,7 +28,7 @@ For deprecated `bertabs` instructions, see [`bertabs/README.md`](https://github. - `FSMTForConditionalGeneration` - `T5ForConditionalGeneration` -### Downlowd the Datasets +### Download the Datasets #### XSUM diff --git a/examples/legacy/seq2seq/seq2seq_trainer.py b/examples/legacy/seq2seq/seq2seq_trainer.py index cba3e958e9c669..075e9f728b1d0a 100644 --- a/examples/legacy/seq2seq/seq2seq_trainer.py +++ b/examples/legacy/seq2seq/seq2seq_trainer.py @@ -73,7 +73,7 @@ def __init__(self, config=None, data_args=None, *args, **kwargs): ), "Make sure that `config.pad_token_id` is correcly defined when ignoring `pad_token` for loss calculation or doing label smoothing." if self.config.pad_token_id is None and self.config.eos_token_id is not None: - logger.warn( + logger.warning( f"The `config.pad_token_id` is `None`. Using `config.eos_token_id` = {self.config.eos_token_id} for padding.." ) @@ -127,7 +127,7 @@ def create_optimizer_and_scheduler(self, num_training_steps: int): if self.lr_scheduler is None: self.lr_scheduler = self._get_lr_scheduler(num_training_steps) else: # ignoring --lr_scheduler - logger.warn("scheduler is passed to `Seq2SeqTrainer`, `--lr_scheduler` arg is ignored.") + logger.warning("scheduler is passed to `Seq2SeqTrainer`, `--lr_scheduler` arg is ignored.") def _get_lr_scheduler(self, num_training_steps): schedule_func = arg_to_scheduler[self.args.lr_scheduler] diff --git a/examples/multiple-choice/run_swag.py b/examples/multiple-choice/run_swag.py index 10af91ee6a67a3..04ad05affd8915 100755 --- a/examples/multiple-choice/run_swag.py +++ b/examples/multiple-choice/run_swag.py @@ -46,7 +46,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.5.0.dev0") +check_min_version("4.6.0.dev0") logger = logging.getLogger(__name__) @@ -310,14 +310,14 @@ def main(): if data_args.max_seq_length is None: max_seq_length = tokenizer.model_max_length if max_seq_length > 1024: - logger.warn( + logger.warning( f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx." ) max_seq_length = 1024 else: if data_args.max_seq_length > tokenizer.model_max_length: - logger.warn( + logger.warning( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." 
) diff --git a/examples/question-answering/run_qa.py b/examples/question-answering/run_qa.py index 314d71578f6e94..fa76110b5139d0 100755 --- a/examples/question-answering/run_qa.py +++ b/examples/question-answering/run_qa.py @@ -46,7 +46,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.5.0.dev0") +check_min_version("4.6.0.dev0") logger = logging.getLogger(__name__) @@ -324,7 +324,7 @@ def main(): pad_on_right = tokenizer.padding_side == "right" if data_args.max_seq_length > tokenizer.model_max_length: - logger.warn( + logger.warning( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) diff --git a/examples/question-answering/run_qa_beam_search.py b/examples/question-answering/run_qa_beam_search.py index 36bd9a0d75e20a..7a6d0b5bb43372 100755 --- a/examples/question-answering/run_qa_beam_search.py +++ b/examples/question-answering/run_qa_beam_search.py @@ -45,7 +45,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.5.0.dev0") +check_min_version("4.6.0.dev0") logger = logging.getLogger(__name__) @@ -313,7 +313,7 @@ def main(): pad_on_right = tokenizer.padding_side == "right" if data_args.max_seq_length > tokenizer.model_max_length: - logger.warn( + logger.warning( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) diff --git a/examples/question-answering/run_qa_beam_search_no_trainer.py b/examples/question-answering/run_qa_beam_search_no_trainer.py new file mode 100644 index 00000000000000..ca0d60c0f8d128 --- /dev/null +++ b/examples/question-answering/run_qa_beam_search_no_trainer.py @@ -0,0 +1,797 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Fine-tuning a 🤗 Transformers model on question answering. +""" +# You can also adapt this script on your own question answering task. Pointers for this are left as comments. + +import argparse +import logging +import math +import os +import random + +import datasets +import numpy as np +import torch +from datasets import load_dataset, load_metric +from torch.utils.data.dataloader import DataLoader +from tqdm.auto import tqdm + +import transformers +from accelerate import Accelerator +from transformers import ( + AdamW, + DataCollatorWithPadding, + EvalPrediction, + SchedulerType, + XLNetConfig, + XLNetForQuestionAnswering, + XLNetTokenizerFast, + default_data_collator, + get_scheduler, + set_seed, +) +from transformers.utils import check_min_version +from utils_qa import postprocess_qa_predictions_with_beam_search + + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
+check_min_version("4.5.0.dev0") + + +logger = logging.getLogger(__name__) + + +def parse_args(): + parser = argparse.ArgumentParser(description="Finetune a transformers model on a Question Answering task") + parser.add_argument( + "--dataset_name", + type=str, + default=None, + help="The name of the dataset to use (via the datasets library).", + ) + parser.add_argument( + "--dataset_config_name", + type=str, + default=None, + help="The configuration name of the dataset to use (via the datasets library).", + ) + parser.add_argument( + "--train_file", type=str, default=None, help="A csv or a json file containing the training data." + ) + parser.add_argument( + "--preprocessing_num_workers", type=int, default=4, help="A csv or a json file containing the training data." + ) + parser.add_argument("--do_predict", action="store_true", help="Eval the question answering model") + parser.add_argument( + "--validation_file", type=str, default=None, help="A csv or a json file containing the validation data." + ) + parser.add_argument( + "--max_seq_length", + type=int, + default=384, + help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated," + " sequences shorter will be padded if `--pad_to_max_lengh` is passed.", + ) + parser.add_argument( + "--pad_to_max_length", + action="store_true", + help="If passed, pad all samples to `max_seq_length`. Otherwise, dynamic padding is used.", + ) + parser.add_argument( + "--model_name_or_path", + type=str, + help="Path to pretrained model or model identifier from huggingface.co/models.", + required=True, + ) + parser.add_argument( + "--per_device_train_batch_size", + type=int, + default=8, + help="Batch size (per device) for the training dataloader.", + ) + parser.add_argument( + "--per_device_eval_batch_size", + type=int, + default=8, + help="Batch size (per device) for the evaluation dataloader.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=5e-5, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.") + parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.") + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--lr_scheduler_type", + type=SchedulerType, + default="linear", + help="The scheduler type to use.", + choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"], + ) + parser.add_argument( + "--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler." 
+ ) + parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.") + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") + parser.add_argument( + "--doc_stride", + type=int, + default=128, + help="When splitting up a long document into chunks how much stride to take between chunks.", + ) + parser.add_argument( + "--n_best_size", + type=int, + default=20, + help="The total number of n-best predictions to generate when looking for an answer.", + ) + parser.add_argument( + "--null_score_diff_threshold", + type=float, + default=0.0, + help="The threshold used to select the null answer: if the best answer has a score that is less than " + "the score of the null answer minus this threshold, the null answer is selected for this example. " + "Only useful when `version_2_with_negative=True`.", + ) + parser.add_argument( + "--version_2_with_negative", + type=bool, + default=False, + help="If true, some of the examples do not have an answer.", + ) + parser.add_argument( + "--max_answer_length", + type=int, + default=30, + help="The maximum length of an answer that can be generated. This is needed because the start " + "and end predictions are not conditioned on one another.", + ) + parser.add_argument( + "--max_train_samples", + type=int, + default=None, + help="For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set.", + ) + parser.add_argument( + "--max_val_samples", + type=int, + default=None, + help="For debugging purposes or quicker training, truncate the number of validation examples to this " + "value if set.", + ) + parser.add_argument( + "--overwrite_cache", type=bool, default=False, help="Overwrite the cached training and evaluation sets" + ) + parser.add_argument( + "--max_test_samples", + type=int, + default=None, + help="For debugging purposes or quicker training, truncate the number of test examples to this", + ) + + args = parser.parse_args() + + # Sanity checks + if args.dataset_name is None and args.train_file is None and args.validation_file is None: + raise ValueError("Need either a dataset name or a training/validation file.") + else: + if args.train_file is not None: + extension = args.train_file.split(".")[-1] + assert extension in ["csv", "json"], "`train_file` should be a csv or a json file." + if args.validation_file is not None: + extension = args.validation_file.split(".")[-1] + assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." + + if args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + + return args + + +def main(): + args = parse_args() + + # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. + accelerator = Accelerator() + # Make one log on every process with the configuration for debugging. + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + logger.info(accelerator.state) + + # Setup logging, we only want one process per machine to log things on the screen. + # accelerator.is_local_main_process is only True for one process per machine. 
+ logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR) + if accelerator.is_local_main_process: + datasets.utils.logging.set_verbosity_warning() + transformers.utils.logging.set_verbosity_info() + else: + datasets.utils.logging.set_verbosity_error() + transformers.utils.logging.set_verbosity_error() + + # If passed along, set the training seed now. + if args.seed is not None: + set_seed(args.seed) + + # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called + # 'text' is found. You can easily tweak this behavior (see below). + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + if args.dataset_name is not None: + # Downloading and loading a dataset from the hub. + raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) + else: + data_files = {} + if args.train_file is not None: + data_files["train"] = args.train_file + if args.validation_file is not None: + data_files["validation"] = args.validation_file + extension = args.train_file.split(".")[-1] + raw_datasets = load_dataset(extension, data_files=data_files) + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets.html. + + # Load pretrained model and tokenizer + # + # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. + + config = XLNetConfig.from_pretrained(args.model_name_or_path) + tokenizer = XLNetTokenizerFast.from_pretrained(args.model_name_or_path) + model = XLNetForQuestionAnswering.from_pretrained( + args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config + ) + + # Preprocessing the datasets. + # Preprocessing is slighlty different for training and evaluation. + column_names = raw_datasets["train"].column_names + + question_column_name = "question" if "question" in column_names else column_names[0] + context_column_name = "context" if "context" in column_names else column_names[1] + answer_column_name = "answers" if "answers" in column_names else column_names[2] + + # Padding side determines if we do (question|context) or (context|question). + pad_on_right = tokenizer.padding_side == "right" + + if args.max_seq_length > tokenizer.model_max_length: + logger.warning( + f"The max_seq_length passed ({args.max_seq_length}) is larger than the maximum length for the" + f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." + ) + + max_seq_length = min(args.max_seq_length, tokenizer.model_max_length) + + # Training preprocessing + def prepare_train_features(examples): + # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results + # in one example possible giving several features when a context is long, each of those features having a + # context that overlaps a bit the context of the previous feature. 
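+        # For example, with max_seq_length=384 and doc_stride=128 a very long context is split into several
+        # overlapping features; the `overflow_to_sample_mapping` popped below records which original example
+        # each feature came from.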
+ tokenized_examples = tokenizer( + examples[question_column_name if pad_on_right else context_column_name], + examples[context_column_name if pad_on_right else question_column_name], + truncation="only_second" if pad_on_right else "only_first", + max_length=max_seq_length, + stride=args.doc_stride, + return_overflowing_tokens=True, + return_offsets_mapping=True, + return_special_tokens_mask=True, + return_token_type_ids=True, + padding="max_length", + ) + + # Since one example might give us several features if it has a long context, we need a map from a feature to + # its corresponding example. This key gives us just that. + sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") + # The offset mappings will give us a map from token to character position in the original context. This will + # help us compute the start_positions and end_positions. + offset_mapping = tokenized_examples.pop("offset_mapping") + # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers). + special_tokens = tokenized_examples.pop("special_tokens_mask") + + # Let's label those examples! + tokenized_examples["start_positions"] = [] + tokenized_examples["end_positions"] = [] + tokenized_examples["is_impossible"] = [] + tokenized_examples["cls_index"] = [] + tokenized_examples["p_mask"] = [] + + for i, offsets in enumerate(offset_mapping): + # We will label impossible answers with the index of the CLS token. + input_ids = tokenized_examples["input_ids"][i] + cls_index = input_ids.index(tokenizer.cls_token_id) + tokenized_examples["cls_index"].append(cls_index) + + # Grab the sequence corresponding to that example (to know what is the context and what is the question). + sequence_ids = tokenized_examples["token_type_ids"][i] + for k, s in enumerate(special_tokens[i]): + if s: + sequence_ids[k] = 3 + context_idx = 1 if pad_on_right else 0 + + # Build the p_mask: non special tokens and context gets 0.0, the others get 1.0. + # The cls token gets 1.0 too (for predictions of empty answers). + tokenized_examples["p_mask"].append( + [ + 0.0 if (not special_tokens[i][k] and s == context_idx) or k == cls_index else 1.0 + for k, s in enumerate(sequence_ids) + ] + ) + + # One example can give several spans, this is the index of the example containing this span of text. + sample_index = sample_mapping[i] + answers = examples[answer_column_name][sample_index] + # If no answers are given, set the cls_index as answer. + if len(answers["answer_start"]) == 0: + tokenized_examples["start_positions"].append(cls_index) + tokenized_examples["end_positions"].append(cls_index) + tokenized_examples["is_impossible"].append(1.0) + else: + # Start/end character index of the answer in the text. + start_char = answers["answer_start"][0] + end_char = start_char + len(answers["text"][0]) + + # Start token index of the current span in the text. + token_start_index = 0 + while sequence_ids[token_start_index] != context_idx: + token_start_index += 1 + + # End token index of the current span in the text. + token_end_index = len(input_ids) - 1 + while sequence_ids[token_end_index] != context_idx: + token_end_index -= 1 + # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). 
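+                # "Out of the span" means this feature does not fully contain the answer characters
+                # [start_char, end_char), so it is trained to predict the CLS index instead.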
+ if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): + tokenized_examples["start_positions"].append(cls_index) + tokenized_examples["end_positions"].append(cls_index) + tokenized_examples["is_impossible"].append(1.0) + else: + # Otherwise move the token_start_index and token_end_index to the two ends of the answer. + # Note: we could go after the last offset if the answer is the last word (edge case). + while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char: + token_start_index += 1 + tokenized_examples["start_positions"].append(token_start_index - 1) + while offsets[token_end_index][1] >= end_char: + token_end_index -= 1 + tokenized_examples["end_positions"].append(token_end_index + 1) + tokenized_examples["is_impossible"].append(0.0) + + return tokenized_examples + + if "train" not in raw_datasets: + raise ValueError("--do_train requires a train dataset") + train_dataset = raw_datasets["train"] + if args.max_train_samples is not None: + # We will select sample from whole data if agument is specified + train_dataset = train_dataset.select(range(args.max_train_samples)) + # Create train feature from dataset + train_dataset = train_dataset.map( + prepare_train_features, + batched=True, + num_proc=args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not args.overwrite_cache, + ) + if args.max_train_samples is not None: + # Number of samples might increase during Feature Creation, We select only specified max samples + train_dataset = train_dataset.select(range(args.max_train_samples)) + + # Validation preprocessing + def prepare_validation_features(examples): + # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results + # in one example possible giving several features when a context is long, each of those features having a + # context that overlaps a bit the context of the previous feature. + tokenized_examples = tokenizer( + examples[question_column_name if pad_on_right else context_column_name], + examples[context_column_name if pad_on_right else question_column_name], + truncation="only_second" if pad_on_right else "only_first", + max_length=max_seq_length, + stride=args.doc_stride, + return_overflowing_tokens=True, + return_offsets_mapping=True, + return_special_tokens_mask=True, + return_token_type_ids=True, + padding="max_length", + ) + + # Since one example might give us several features if it has a long context, we need a map from a feature to + # its corresponding example. This key gives us just that. + sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") + + # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers). + special_tokens = tokenized_examples.pop("special_tokens_mask") + + # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the + # corresponding example_id and we will store the offset mappings. + tokenized_examples["example_id"] = [] + + # We still provide the index of the CLS token and the p_mask to the model, but not the is_impossible label. + tokenized_examples["cls_index"] = [] + tokenized_examples["p_mask"] = [] + + for i, input_ids in enumerate(tokenized_examples["input_ids"]): + # Find the CLS token in the input ids. 
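+            # XLNet's CLS token sits at the end of the sequence, unlike BERT-style models where it comes first,
+            # which is why we look its position up instead of assuming index 0.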
+ cls_index = input_ids.index(tokenizer.cls_token_id) + tokenized_examples["cls_index"].append(cls_index) + + # Grab the sequence corresponding to that example (to know what is the context and what is the question). + sequence_ids = tokenized_examples["token_type_ids"][i] + for k, s in enumerate(special_tokens[i]): + if s: + sequence_ids[k] = 3 + context_idx = 1 if pad_on_right else 0 + + # Build the p_mask: non special tokens and context gets 0.0, the others 1.0. + tokenized_examples["p_mask"].append( + [ + 0.0 if (not special_tokens[i][k] and s == context_idx) or k == cls_index else 1.0 + for k, s in enumerate(sequence_ids) + ] + ) + + # One example can give several spans, this is the index of the example containing this span of text. + sample_index = sample_mapping[i] + tokenized_examples["example_id"].append(examples["id"][sample_index]) + + # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token + # position is part of the context or not. + tokenized_examples["offset_mapping"][i] = [ + (o if sequence_ids[k] == context_idx else None) + for k, o in enumerate(tokenized_examples["offset_mapping"][i]) + ] + + return tokenized_examples + + if "validation" not in raw_datasets: + raise ValueError("--do_eval requires a validation dataset") + eval_examples = raw_datasets["validation"] + if args.max_val_samples is not None: + # We will select sample from whole data + eval_examples = eval_examples.select(range(args.max_val_samples)) + # Validation Feature Creation + eval_dataset = eval_examples.map( + prepare_validation_features, + batched=True, + num_proc=args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not args.overwrite_cache, + ) + + if args.max_val_samples is not None: + # During Feature creation dataset samples might increase, we will select required samples again + eval_dataset = eval_dataset.select(range(args.max_val_samples)) + + if args.do_predict: + if "test" not in raw_datasets: + raise ValueError("--do_predict requires a test dataset") + test_examples = raw_datasets["test"] + if args.max_test_samples is not None: + # We will select sample from whole data + test_examples = test_examples.select(range(args.max_test_samples)) + # Test Feature Creation + test_dataset = test_examples.map( + prepare_validation_features, + batched=True, + num_proc=args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not args.overwrite_cache, + ) + if args.max_test_samples is not None: + # During Feature creation dataset samples might increase, we will select required samples again + test_dataset = test_dataset.select(range(args.max_test_samples)) + + # Log a few random samples from the training set: + for index in random.sample(range(len(train_dataset)), 3): + logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") + + # DataLoaders creation: + if args.pad_to_max_length: + # If padding was already done ot max length, we use the default data collator that will just convert everything + # to tensors. + data_collator = default_data_collator + else: + # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of + # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple + # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). 
+ data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)) + + train_dataloader = DataLoader( + train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size + ) + + eval_dataset.set_format(type="torch", columns=["attention_mask", "input_ids", "token_type_ids"]) + eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size) + + if args.do_predict: + test_dataset.set_format(type="torch", columns=["attention_mask", "input_ids", "token_type_ids"]) + test_dataloader = DataLoader( + test_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size + ) + + # Post-processing: + def post_processing_function(examples, features, predictions, stage="eval"): + # Post-processing: we match the start logits and end logits to answers in the original context. + predictions, scores_diff_json = postprocess_qa_predictions_with_beam_search( + examples=examples, + features=features, + predictions=predictions, + version_2_with_negative=args.version_2_with_negative, + n_best_size=args.n_best_size, + max_answer_length=args.max_answer_length, + start_n_top=model.config.start_n_top, + end_n_top=model.config.end_n_top, + output_dir=args.output_dir, + prefix=stage, + ) + # Format the result to the format the metric expects. + if args.version_2_with_negative: + formatted_predictions = [ + {"id": k, "prediction_text": v, "no_answer_probability": scores_diff_json[k]} + for k, v in predictions.items() + ] + else: + formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] + + references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples] + return EvalPrediction(predictions=formatted_predictions, label_ids=references) + + metric = load_metric("squad_v2" if args.version_2_with_negative else "squad") + + def create_and_fill_np_array(start_or_end_logits, dataset, max_len): + """ + Create and fill numpy array of size len_of_validation_data * max_length_of_output_tensor + + Args: + start_or_end_logits(:obj:`tensor`): + This is the output predictions of the model. We can only enter either start or end logits. + eval_dataset: Evaluation dataset + max_len(:obj:`int`): + The maximum length of the output tensor. ( See the model.eval() part for more details ) + """ + + step = 0 + # create a numpy array and fill it with -100. + logits_concat = np.full((len(dataset), max_len), -100, dtype=np.float32) + # Now since we have create an array now we will populate it with the outputs gathered using accelerator.gather + for i, output_logit in enumerate(start_or_end_logits): # populate columns + # We have to fill it such that we have to take the whole tensor and replace it on the newly created array + # And after every iteration we have to change the step + + batch_size = output_logit.shape[0] + cols = output_logit.shape[1] + if step + batch_size < len(dataset): + logits_concat[step : step + batch_size, :cols] = output_logit + else: + logits_concat[step:, :cols] = output_logit[: len(dataset) - step] + + step += batch_size + + return logits_concat + + # Optimizer + # Split weights in two groups, one with weight decay and the other not. 
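+    # As in the other fine-tuning examples, biases and LayerNorm weights are excluded from weight decay.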
+ no_decay = ["bias", "LayerNorm.weight"] + optimizer_grouped_parameters = [ + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": args.weight_decay, + }, + { + "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], + "weight_decay": 0.0, + }, + ] + optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) + + # Prepare everything with our `accelerator`. + model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( + model, optimizer, train_dataloader, eval_dataloader + ) + + # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be + # shorter in multiprocess) + + # Scheduler and math around the number of training steps. + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if args.max_train_steps is None: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + else: + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + + lr_scheduler = get_scheduler( + name=args.lr_scheduler_type, + optimizer=optimizer, + num_warmup_steps=args.num_warmup_steps, + num_training_steps=args.max_train_steps, + ) + + # Train! + total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + + logger.info("***** Running training *****") + logger.info(f" Num examples = {len(train_dataset)}") + logger.info(f" Num Epochs = {args.num_train_epochs}") + logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Total optimization steps = {args.max_train_steps}") + + # Only show the progress bar once on each machine. 
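+    # Note that `completed_steps` counts optimizer updates (not micro-batches), so it is what gets compared
+    # to args.max_train_steps below.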
+    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
+    completed_steps = 0
+
+    for epoch in range(args.num_train_epochs):
+        model.train()
+        for step, batch in enumerate(train_dataloader):
+            outputs = model(**batch)
+            loss = outputs.loss
+            loss = loss / args.gradient_accumulation_steps
+            accelerator.backward(loss)
+            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+                progress_bar.update(1)
+                completed_steps += 1
+
+            if completed_steps >= args.max_train_steps:
+                break
+
+    # initialize all lists to collect the batches
+
+    all_start_top_log_probs = []
+    all_start_top_index = []
+    all_end_top_log_probs = []
+    all_end_top_index = []
+    all_cls_logits = []
+    for step, batch in enumerate(eval_dataloader):
+        with torch.no_grad():
+            outputs = model(**batch)
+            start_top_log_probs = outputs.start_top_log_probs
+            start_top_index = outputs.start_top_index
+            end_top_log_probs = outputs.end_top_log_probs
+            end_top_index = outputs.end_top_index
+            cls_logits = outputs.cls_logits
+
+            if not args.pad_to_max_length:  # necessary to pad predictions and labels for being gathered
+                start_top_log_probs = accelerator.pad_across_processes(start_top_log_probs, dim=1, pad_index=-100)
+                start_top_index = accelerator.pad_across_processes(start_top_index, dim=1, pad_index=-100)
+                end_top_log_probs = accelerator.pad_across_processes(end_top_log_probs, dim=1, pad_index=-100)
+                end_top_index = accelerator.pad_across_processes(end_top_index, dim=1, pad_index=-100)
+                cls_logits = accelerator.pad_across_processes(cls_logits, dim=1, pad_index=-100)
+
+            all_start_top_log_probs.append(accelerator.gather(start_top_log_probs).cpu().numpy())
+            all_start_top_index.append(accelerator.gather(start_top_index).cpu().numpy())
+            all_end_top_log_probs.append(accelerator.gather(end_top_log_probs).cpu().numpy())
+            all_end_top_index.append(accelerator.gather(end_top_index).cpu().numpy())
+            all_cls_logits.append(accelerator.gather(cls_logits).cpu().numpy())
+
+    max_len = max([x.shape[1] for x in all_end_top_log_probs])  # Get the max_length of the tensor
+
+    # concatenate all numpy arrays collected above
+    start_top_log_probs_concat = create_and_fill_np_array(all_start_top_log_probs, eval_dataset, max_len)
+    start_top_index_concat = create_and_fill_np_array(all_start_top_index, eval_dataset, max_len)
+    end_top_log_probs_concat = create_and_fill_np_array(all_end_top_log_probs, eval_dataset, max_len)
+    end_top_index_concat = create_and_fill_np_array(all_end_top_index, eval_dataset, max_len)
+    all_cls_logits = np.concatenate(all_cls_logits, axis=0)
+
+    # delete the lists of numpy arrays
+    del all_start_top_log_probs
+    del all_start_top_index
+    del all_end_top_log_probs
+    del all_end_top_index
+
+    eval_dataset.set_format(type=None, columns=list(eval_dataset.features.keys()))
+    outputs_numpy = (
+        start_top_log_probs_concat,
+        start_top_index_concat,
+        end_top_log_probs_concat,
+        end_top_index_concat,
+        all_cls_logits,
+    )
+    prediction = post_processing_function(eval_examples, eval_dataset, outputs_numpy)
+    eval_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids)
+    logger.info(f"Evaluation metrics: {eval_metric}")
+
+    if args.do_predict:
+        # initialize all lists to collect the batches
+
+        all_start_top_log_probs = []
+        all_start_top_index = []
+        all_end_top_log_probs = []
+        all_end_top_index = []
+        all_cls_logits = []
+        for step, batch in enumerate(test_dataloader):
+            with torch.no_grad():
+                outputs = model(**batch)
+                start_top_log_probs = outputs.start_top_log_probs
+                start_top_index = outputs.start_top_index
+                end_top_log_probs = outputs.end_top_log_probs
+                end_top_index = outputs.end_top_index
+                cls_logits = outputs.cls_logits
+
+                if not args.pad_to_max_length:  # necessary to pad predictions and labels for being gathered
+                    start_top_log_probs = accelerator.pad_across_processes(start_top_log_probs, dim=1, pad_index=-100)
+                    start_top_index = accelerator.pad_across_processes(start_top_index, dim=1, pad_index=-100)
+                    end_top_log_probs = accelerator.pad_across_processes(end_top_log_probs, dim=1, pad_index=-100)
+                    end_top_index = accelerator.pad_across_processes(end_top_index, dim=1, pad_index=-100)
+                    cls_logits = accelerator.pad_across_processes(cls_logits, dim=1, pad_index=-100)
+
+                all_start_top_log_probs.append(accelerator.gather(start_top_log_probs).cpu().numpy())
+                all_start_top_index.append(accelerator.gather(start_top_index).cpu().numpy())
+                all_end_top_log_probs.append(accelerator.gather(end_top_log_probs).cpu().numpy())
+                all_end_top_index.append(accelerator.gather(end_top_index).cpu().numpy())
+                all_cls_logits.append(accelerator.gather(cls_logits).cpu().numpy())
+
+        max_len = max([x.shape[1] for x in all_end_top_log_probs])  # Get the max_length of the tensor
+
+        # concatenate all numpy arrays collected above
+        start_top_log_probs_concat = create_and_fill_np_array(all_start_top_log_probs, test_dataset, max_len)
+        start_top_index_concat = create_and_fill_np_array(all_start_top_index, test_dataset, max_len)
+        end_top_log_probs_concat = create_and_fill_np_array(all_end_top_log_probs, test_dataset, max_len)
+        end_top_index_concat = create_and_fill_np_array(all_end_top_index, test_dataset, max_len)
+        all_cls_logits = np.concatenate(all_cls_logits, axis=0)
+
+        # delete the lists of numpy arrays
+        del all_start_top_log_probs
+        del all_start_top_index
+        del all_end_top_log_probs
+        del all_end_top_index
+
+        test_dataset.set_format(type=None, columns=list(test_dataset.features.keys()))
+        outputs_numpy = (
+            start_top_log_probs_concat,
+            start_top_index_concat,
+            end_top_log_probs_concat,
+            end_top_index_concat,
+            all_cls_logits,
+        )
+
+        prediction = post_processing_function(test_examples, test_dataset, outputs_numpy)
+        test_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids)
+        logger.info(f"Test metrics: {test_metric}")
+
+    if args.output_dir is not None:
+        accelerator.wait_for_everyone()
+        unwrapped_model = accelerator.unwrap_model(model)
+        unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/question-answering/run_qa_no_trainer.py b/examples/question-answering/run_qa_no_trainer.py
new file mode 100755
index 00000000000000..7a8b2215be7545
--- /dev/null
+++ b/examples/question-answering/run_qa_no_trainer.py
@@ -0,0 +1,753 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" +Fine-tuning a 🤗 Transformers model on question answering. +""" +# You can also adapt this script on your own question answering task. Pointers for this are left as comments. + +import argparse +import logging +import math +import os +import random + +import datasets +import numpy as np +import torch +from datasets import load_dataset, load_metric +from torch.utils.data.dataloader import DataLoader +from tqdm.auto import tqdm + +import transformers +from accelerate import Accelerator +from transformers import ( + CONFIG_MAPPING, + MODEL_MAPPING, + AdamW, + AutoConfig, + AutoModelForQuestionAnswering, + AutoTokenizer, + DataCollatorWithPadding, + EvalPrediction, + SchedulerType, + default_data_collator, + get_scheduler, + set_seed, +) +from transformers.utils import check_min_version +from utils_qa import postprocess_qa_predictions + + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. +check_min_version("4.5.0.dev0") + + +logger = logging.getLogger(__name__) +# You should update this to your particular problem to have better documentation of `model_type` +MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys()) +MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) + + +def parse_args(): + parser = argparse.ArgumentParser(description="Finetune a transformers model on a Question Answering task") + parser.add_argument( + "--dataset_name", + type=str, + default=None, + help="The name of the dataset to use (via the datasets library).", + ) + parser.add_argument( + "--dataset_config_name", + type=str, + default=None, + help="The configuration name of the dataset to use (via the datasets library).", + ) + parser.add_argument( + "--train_file", type=str, default=None, help="A csv or a json file containing the training data." + ) + parser.add_argument( + "--preprocessing_num_workers", type=int, default=4, help="A csv or a json file containing the training data." + ) + parser.add_argument("--do_predict", action="store_true", help="Eval the question answering model") + parser.add_argument( + "--validation_file", type=str, default=None, help="A csv or a json file containing the validation data." + ) + parser.add_argument( + "--max_seq_length", + type=int, + default=384, + help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated," + " sequences shorter will be padded if `--pad_to_max_lengh` is passed.", + ) + parser.add_argument( + "--pad_to_max_length", + action="store_true", + help="If passed, pad all samples to `max_seq_length`. 
Otherwise, dynamic padding is used.", + ) + parser.add_argument( + "--model_name_or_path", + type=str, + help="Path to pretrained model or model identifier from huggingface.co/models.", + required=True, + ) + parser.add_argument( + "--config_name", + type=str, + default=None, + help="Pretrained config name or path if not the same as model_name", + ) + parser.add_argument( + "--tokenizer_name", + type=str, + default=None, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--use_slow_tokenizer", + action="store_true", + help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).", + ) + parser.add_argument( + "--per_device_train_batch_size", + type=int, + default=8, + help="Batch size (per device) for the training dataloader.", + ) + parser.add_argument( + "--per_device_eval_batch_size", + type=int, + default=8, + help="Batch size (per device) for the evaluation dataloader.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=5e-5, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.") + parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.") + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--lr_scheduler_type", + type=SchedulerType, + default="linear", + help="The scheduler type to use.", + choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"], + ) + parser.add_argument( + "--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler." + ) + parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.") + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") + parser.add_argument( + "--doc_stride", + type=int, + default=128, + help="When splitting up a long document into chunks how much stride to take between chunks.", + ) + parser.add_argument( + "--n_best_size", + type=int, + default=20, + help="The total number of n-best predictions to generate when looking for an answer.", + ) + parser.add_argument( + "--null_score_diff_threshold", + type=float, + default=0.0, + help="The threshold used to select the null answer: if the best answer has a score that is less than " + "the score of the null answer minus this threshold, the null answer is selected for this example. " + "Only useful when `version_2_with_negative=True`.", + ) + parser.add_argument( + "--version_2_with_negative", + type=bool, + default=False, + help="If true, some of the examples do not have an answer.", + ) + parser.add_argument( + "--max_answer_length", + type=int, + default=30, + help="The maximum length of an answer that can be generated. 
This is needed because the start " + "and end predictions are not conditioned on one another.", + ) + parser.add_argument( + "--max_train_samples", + type=int, + default=None, + help="For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set.", + ) + parser.add_argument( + "--max_val_samples", + type=int, + default=None, + help="For debugging purposes or quicker training, truncate the number of validation examples to this " + "value if set.", + ) + parser.add_argument( + "--overwrite_cache", type=bool, default=False, help="Overwrite the cached training and evaluation sets" + ) + parser.add_argument( + "--max_test_samples", + type=int, + default=None, + help="For debugging purposes or quicker training, truncate the number of test examples to this", + ) + parser.add_argument( + "--model_type", + type=str, + default=None, + help="Model type to use if training from scratch.", + choices=MODEL_TYPES, + ) + + args = parser.parse_args() + + # Sanity checks + if args.dataset_name is None and args.train_file is None and args.validation_file is None: + raise ValueError("Need either a dataset name or a training/validation file.") + else: + if args.train_file is not None: + extension = args.train_file.split(".")[-1] + assert extension in ["csv", "json"], "`train_file` should be a csv or a json file." + if args.validation_file is not None: + extension = args.validation_file.split(".")[-1] + assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." + + if args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + + return args + + +def main(): + args = parse_args() + + # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. + accelerator = Accelerator() + # Make one log on every process with the configuration for debugging. + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + logger.info(accelerator.state) + + # Setup logging, we only want one process per machine to log things on the screen. + # accelerator.is_local_main_process is only True for one process per machine. + logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR) + if accelerator.is_local_main_process: + datasets.utils.logging.set_verbosity_warning() + transformers.utils.logging.set_verbosity_info() + else: + datasets.utils.logging.set_verbosity_error() + transformers.utils.logging.set_verbosity_error() + + # If passed along, set the training seed now. + if args.seed is not None: + set_seed(args.seed) + + # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called + # 'text' is found. You can easily tweak this behavior (see below). + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + if args.dataset_name is not None: + # Downloading and loading a dataset from the hub. 
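+        # For example, passing `--dataset_name squad` (or `squad_v2` together with
+        # `--version_2_with_negative`) downloads the corresponding dataset from the Hub.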
+ raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) + else: + data_files = {} + if args.train_file is not None: + data_files["train"] = args.train_file + if args.validation_file is not None: + data_files["validation"] = args.validation_file + extension = args.train_file.split(".")[-1] + raw_datasets = load_dataset(extension, data_files=data_files) + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets.html. + + # Load pretrained model and tokenizer + # + # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. + + if args.config_name: + config = AutoConfig.from_pretrained(args.config_name) + elif args.model_name_or_path: + config = AutoConfig.from_pretrained(args.model_name_or_path) + else: + config = CONFIG_MAPPING[args.model_type]() + logger.warning("You are instantiating a new config instance from scratch.") + + if args.tokenizer_name: + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=True) + elif args.model_name_or_path: + tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=True) + else: + raise ValueError( + "You are instantiating a new tokenizer from scratch. This is not supported by this script." + "You can do it from another script, save it, and load it from here, using --tokenizer_name." + ) + + if args.model_name_or_path: + model = AutoModelForQuestionAnswering.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + ) + else: + logger.info("Training new model from scratch") + model = AutoModelForQuestionAnswering.from_config(config) + + # Preprocessing the datasets. + # Preprocessing is slighlty different for training and evaluation. + + column_names = raw_datasets["train"].column_names + + question_column_name = "question" if "question" in column_names else column_names[0] + context_column_name = "context" if "context" in column_names else column_names[1] + answer_column_name = "answers" if "answers" in column_names else column_names[2] + + # Padding side determines if we do (question|context) or (context|question). + pad_on_right = tokenizer.padding_side == "right" + + if args.max_seq_length > tokenizer.model_max_length: + logger.warning( + f"The max_seq_length passed ({args.max_seq_length}) is larger than the maximum length for the" + f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." + ) + + max_seq_length = min(args.max_seq_length, tokenizer.model_max_length) + + # Training preprocessing + def prepare_train_features(examples): + # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results + # in one example possible giving several features when a context is long, each of those features having a + # context that overlaps a bit the context of the previous feature. 
+ tokenized_examples = tokenizer( + examples[question_column_name if pad_on_right else context_column_name], + examples[context_column_name if pad_on_right else question_column_name], + truncation="only_second" if pad_on_right else "only_first", + max_length=max_seq_length, + stride=args.doc_stride, + return_overflowing_tokens=True, + return_offsets_mapping=True, + padding="max_length" if args.pad_to_max_length else False, + ) + + # Since one example might give us several features if it has a long context, we need a map from a feature to + # its corresponding example. This key gives us just that. + sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") + # The offset mappings will give us a map from token to character position in the original context. This will + # help us compute the start_positions and end_positions. + offset_mapping = tokenized_examples.pop("offset_mapping") + + # Let's label those examples! + tokenized_examples["start_positions"] = [] + tokenized_examples["end_positions"] = [] + + for i, offsets in enumerate(offset_mapping): + # We will label impossible answers with the index of the CLS token. + input_ids = tokenized_examples["input_ids"][i] + cls_index = input_ids.index(tokenizer.cls_token_id) + + # Grab the sequence corresponding to that example (to know what is the context and what is the question). + sequence_ids = tokenized_examples.sequence_ids(i) + + # One example can give several spans, this is the index of the example containing this span of text. + sample_index = sample_mapping[i] + answers = examples[answer_column_name][sample_index] + # If no answers are given, set the cls_index as answer. + if len(answers["answer_start"]) == 0: + tokenized_examples["start_positions"].append(cls_index) + tokenized_examples["end_positions"].append(cls_index) + else: + # Start/end character index of the answer in the text. + start_char = answers["answer_start"][0] + end_char = start_char + len(answers["text"][0]) + + # Start token index of the current span in the text. + token_start_index = 0 + while sequence_ids[token_start_index] != (1 if pad_on_right else 0): + token_start_index += 1 + + # End token index of the current span in the text. + token_end_index = len(input_ids) - 1 + while sequence_ids[token_end_index] != (1 if pad_on_right else 0): + token_end_index -= 1 + + # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). + if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): + tokenized_examples["start_positions"].append(cls_index) + tokenized_examples["end_positions"].append(cls_index) + else: + # Otherwise move the token_start_index and token_end_index to the two ends of the answer. + # Note: we could go after the last offset if the answer is the last word (edge case). 
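+                    # e.g. if the answer starts at character 53, we advance past every token whose span starts
+                    # at or before character 53, then step back one token to land on the first answer token.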
+ while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char: + token_start_index += 1 + tokenized_examples["start_positions"].append(token_start_index - 1) + while offsets[token_end_index][1] >= end_char: + token_end_index -= 1 + tokenized_examples["end_positions"].append(token_end_index + 1) + + return tokenized_examples + + if "train" not in raw_datasets: + raise ValueError("--do_train requires a train dataset") + train_dataset = raw_datasets["train"] + if args.max_train_samples is not None: + # We will select sample from whole data if agument is specified + train_dataset = train_dataset.select(range(args.max_train_samples)) + # Create train feature from dataset + train_dataset = train_dataset.map( + prepare_train_features, + batched=True, + num_proc=args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not args.overwrite_cache, + ) + if args.max_train_samples is not None: + # Number of samples might increase during Feature Creation, We select only specified max samples + train_dataset = train_dataset.select(range(args.max_train_samples)) + + # Validation preprocessing + def prepare_validation_features(examples): + # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results + # in one example possible giving several features when a context is long, each of those features having a + # context that overlaps a bit the context of the previous feature. + tokenized_examples = tokenizer( + examples[question_column_name if pad_on_right else context_column_name], + examples[context_column_name if pad_on_right else question_column_name], + truncation="only_second" if pad_on_right else "only_first", + max_length=max_seq_length, + stride=args.doc_stride, + return_overflowing_tokens=True, + return_offsets_mapping=True, + padding="max_length" if args.pad_to_max_length else False, + ) + + # Since one example might give us several features if it has a long context, we need a map from a feature to + # its corresponding example. This key gives us just that. + sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") + + # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the + # corresponding example_id and we will store the offset mappings. + tokenized_examples["example_id"] = [] + + for i in range(len(tokenized_examples["input_ids"])): + # Grab the sequence corresponding to that example (to know what is the context and what is the question). + sequence_ids = tokenized_examples.sequence_ids(i) + context_index = 1 if pad_on_right else 0 + + # One example can give several spans, this is the index of the example containing this span of text. + sample_index = sample_mapping[i] + tokenized_examples["example_id"].append(examples["id"][sample_index]) + + # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token + # position is part of the context or not. 
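+            # Each offset_mapping entry is a (start_char, end_char) pair into the original text; entries for the
+            # question and special tokens are set to None below so that post-processing only maps context tokens
+            # back to the passage.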
+ tokenized_examples["offset_mapping"][i] = [ + (o if sequence_ids[k] == context_index else None) + for k, o in enumerate(tokenized_examples["offset_mapping"][i]) + ] + + return tokenized_examples + + if "validation" not in raw_datasets: + raise ValueError("--do_eval requires a validation dataset") + eval_examples = raw_datasets["validation"] + if args.max_val_samples is not None: + # We will select sample from whole data + eval_examples = eval_examples.select(range(args.max_val_samples)) + # Validation Feature Creation + eval_dataset = eval_examples.map( + prepare_validation_features, + batched=True, + num_proc=args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not args.overwrite_cache, + ) + + if args.max_val_samples is not None: + # During Feature creation dataset samples might increase, we will select required samples again + eval_dataset = eval_dataset.select(range(args.max_val_samples)) + + if args.do_predict: + if "test" not in raw_datasets: + raise ValueError("--do_predict requires a test dataset") + test_examples = raw_datasets["test"] + if args.max_test_samples is not None: + # We will select sample from whole data + test_examples = test_examples.select(range(args.max_test_samples)) + # Test Feature Creation + test_dataset = test_examples.map( + prepare_validation_features, + batched=True, + num_proc=args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not args.overwrite_cache, + ) + if args.max_test_samples is not None: + # During Feature creation dataset samples might increase, we will select required samples again + test_dataset = test_dataset.select(range(args.max_test_samples)) + + # Log a few random samples from the training set: + for index in random.sample(range(len(train_dataset)), 3): + logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") + + # DataLoaders creation: + if args.pad_to_max_length: + # If padding was already done ot max length, we use the default data collator that will just convert everything + # to tensors. + data_collator = default_data_collator + else: + # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of + # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple + # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). + data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)) + + train_dataloader = DataLoader( + train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size + ) + + eval_dataset.set_format(type="torch", columns=["attention_mask", "input_ids", "token_type_ids"]) + eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size) + + if args.do_predict: + test_dataset.set_format(type="torch", columns=["attention_mask", "input_ids", "token_type_ids"]) + test_dataloader = DataLoader( + test_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size + ) + + # Post-processing: + def post_processing_function(examples, features, predictions, stage="eval"): + # Post-processing: we match the start logits and end logits to answers in the original context. 
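+        # `postprocess_qa_predictions` (defined in utils_qa.py) returns a dict mapping each example id to its
+        # best predicted answer string, which is then reshaped below into the format the squad metric expects.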
+ predictions = postprocess_qa_predictions( + examples=examples, + features=features, + predictions=predictions, + version_2_with_negative=args.version_2_with_negative, + n_best_size=args.n_best_size, + max_answer_length=args.max_answer_length, + null_score_diff_threshold=args.null_score_diff_threshold, + output_dir=args.output_dir, + prefix=stage, + ) + # Format the result to the format the metric expects. + if args.version_2_with_negative: + formatted_predictions = [ + {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items() + ] + else: + formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] + + references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples] + return EvalPrediction(predictions=formatted_predictions, label_ids=references) + + metric = load_metric("squad_v2" if args.version_2_with_negative else "squad") + + # Create and fill numpy array of size len_of_validation_data * max_length_of_output_tensor + def create_and_fill_np_array(start_or_end_logits, dataset, max_len): + """ + Create and fill numpy array of size len_of_validation_data * max_length_of_output_tensor + + Args: + start_or_end_logits(:obj:`tensor`): + This is the output predictions of the model. We can only enter either start or end logits. + eval_dataset: Evaluation dataset + max_len(:obj:`int`): + The maximum length of the output tensor. ( See the model.eval() part for more details ) + """ + + step = 0 + # create a numpy array and fill it with -100. + logits_concat = np.full((len(dataset), max_len), -100, dtype=np.float64) + # Now since we have create an array now we will populate it with the outputs gathered using accelerator.gather + for i, output_logit in enumerate(start_or_end_logits): # populate columns + # We have to fill it such that we have to take the whole tensor and replace it on the newly created array + # And after every iteration we have to change the step + + batch_size = output_logit.shape[0] + cols = output_logit.shape[1] + + if step + batch_size < len(dataset): + logits_concat[step : step + batch_size, :cols] = output_logit + else: + logits_concat[step:, :cols] = output_logit[: len(dataset) - step] + + step += batch_size + + return logits_concat + + # Optimizer + # Split weights in two groups, one with weight decay and the other not. + no_decay = ["bias", "LayerNorm.weight"] + optimizer_grouped_parameters = [ + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": args.weight_decay, + }, + { + "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], + "weight_decay": 0.0, + }, + ] + optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) + + # Prepare everything with our `accelerator`. + model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( + model, optimizer, train_dataloader, eval_dataloader + ) + + # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be + # shorter in multiprocess) + + # Scheduler and math around the number of training steps. 
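+    # For example, with 1000 training batches per process and --gradient_accumulation_steps 2 there are
+    # 500 optimizer updates per epoch, so the default 3 epochs give max_train_steps = 1500 unless it is set
+    # explicitly.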
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if args.max_train_steps is None: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + else: + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + + lr_scheduler = get_scheduler( + name=args.lr_scheduler_type, + optimizer=optimizer, + num_warmup_steps=args.num_warmup_steps, + num_training_steps=args.max_train_steps, + ) + + # Train! + total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + + logger.info("***** Running training *****") + logger.info(f" Num examples = {len(train_dataset)}") + logger.info(f" Num Epochs = {args.num_train_epochs}") + logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Total optimization steps = {args.max_train_steps}") + + # Only show the progress bar once on each machine. + progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) + completed_steps = 0 + + for epoch in range(args.num_train_epochs): + model.train() + for step, batch in enumerate(train_dataloader): + outputs = model(**batch) + loss = outputs.loss + loss = loss / args.gradient_accumulation_steps + accelerator.backward(loss) + if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + progress_bar.update(1) + completed_steps += 1 + + if completed_steps >= args.max_train_steps: + break + + # Validation + all_start_logits = [] + all_end_logits = [] + for step, batch in enumerate(eval_dataloader): + with torch.no_grad(): + outputs = model(**batch) + start_logits = outputs.start_logits + end_logits = outputs.end_logits + + if not args.pad_to_max_length: # necessary to pad predictions and labels for being gathered + start_logits = accelerator.pad_across_processes(start_logits, dim=1, pad_index=-100) + end_logits = accelerator.pad_across_processes(end_logits, dim=1, pad_index=-100) + + all_start_logits.append(accelerator.gather(start_logits).cpu().numpy()) + all_end_logits.append(accelerator.gather(end_logits).cpu().numpy()) + + max_len = max([x.shape[1] for x in all_start_logits]) # Get the max_length of the tensor + + # concatenate the numpy array + start_logits_concat = create_and_fill_np_array(all_start_logits, eval_dataset, max_len) + end_logits_concat = create_and_fill_np_array(all_end_logits, eval_dataset, max_len) + + # delete the list of numpy arrays + del all_start_logits + del all_end_logits + + eval_dataset.set_format(type=None, columns=list(eval_dataset.features.keys())) + outputs_numpy = (start_logits_concat, end_logits_concat) + prediction = post_processing_function(eval_examples, eval_dataset, outputs_numpy) + eval_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids) + logger.info(f"Evaluation metrics: {eval_metric}") + + # Prediction + if args.do_predict: + all_start_logits = [] + all_end_logits = [] + for step, batch in enumerate(test_dataloader): + with torch.no_grad(): + outputs = model(**batch) + start_logits = outputs.start_logits + end_logits = outputs.end_logits + + if not args.pad_to_max_length: # necessary to pad predictions and labels for being 
gathered + start_logits = accelerator.pad_across_processes(start_logits, dim=1, pad_index=-100) + end_logits = accelerator.pad_across_processes(end_logits, dim=1, pad_index=-100) + + all_start_logits.append(accelerator.gather(start_logits).cpu().numpy()) + all_end_logits.append(accelerator.gather(end_logits).cpu().numpy()) + + max_len = max([x.shape[1] for x in all_start_logits]) # Get the max_length of the tensor + # concatenate the numpy array + start_logits_concat = create_and_fill_np_array(all_start_logits, test_dataset, max_len) + end_logits_concat = create_and_fill_np_array(all_end_logits, test_dataset, max_len) + + # delete the list of numpy arrays + del all_start_logits + del all_end_logits + + # Now we need to add extra columns which we removed for post processing + test_dataset.set_format(type=None, columns=list(test_dataset.features.keys())) + outputs_numpy = (start_logits_concat, end_logits_concat) + prediction = post_processing_function(test_examples, test_dataset, outputs_numpy) + eval_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids) + logger.info(f"Test metrics: {eval_metric}") + + if args.output_dir is not None: + accelerator.wait_for_everyone() + unwrapped_model = accelerator.unwrap_model(model) + unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save) + + +if __name__ == "__main__": + main() diff --git a/examples/question-answering/run_tf_squad.py b/examples/question-answering/run_tf_squad.py index 0cad705433ba0c..20723f70e8fdae 100755 --- a/examples/question-answering/run_tf_squad.py +++ b/examples/question-answering/run_tf_squad.py @@ -181,7 +181,7 @@ def main(): # Get datasets if data_args.use_tfds: if data_args.version_2_with_negative: - logger.warn("tensorflow_datasets does not handle version 2 of SQuAD. Switch to version 1 automatically") + logger.warning("tensorflow_datasets does not handle version 2 of SQuAD. Switch to version 1 automatically") try: import tensorflow_datasets as tfds diff --git a/examples/question-answering/utils_qa.py b/examples/question-answering/utils_qa.py index 84acb91be7db9b..2f8f0a60c45fe5 100644 --- a/examples/question-answering/utils_qa.py +++ b/examples/question-answering/utils_qa.py @@ -335,9 +335,9 @@ def postprocess_qa_predictions_with_beam_search( # Go through all possibilities for the `n_start_top`/`n_end_top` greater start and end logits.
for i in range(start_n_top): for j in range(end_n_top): - start_index = start_indexes[i] + start_index = int(start_indexes[i]) j_index = i * end_n_top + j - end_index = end_indexes[j_index] + end_index = int(end_indexes[j_index]) # Don't consider out-of-scope answers (last part of the test should be unnecessary because of the # p_mask but let's not take any risk) if ( diff --git a/examples/research_projects/movement-pruning/masked_run_squad.py b/examples/research_projects/movement-pruning/masked_run_squad.py index 979649a6be2bc2..9fd219c089068a 100644 --- a/examples/research_projects/movement-pruning/masked_run_squad.py +++ b/examples/research_projects/movement-pruning/masked_run_squad.py @@ -629,7 +629,7 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal raise ImportError("If not data_dir is specified, tensorflow_datasets needs to be installed.") if args.version_2_with_negative: - logger.warn("tensorflow_datasets does not handle version 2 of SQuAD.") + logger.warning("tensorflow_datasets does not handle version 2 of SQuAD.") tfds_examples = tfds.load("squad") examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate) diff --git a/examples/seq2seq/run_summarization.py b/examples/seq2seq/run_summarization.py index dc02f8c71d8ef9..811c5a524215ff 100755 --- a/examples/seq2seq/run_summarization.py +++ b/examples/seq2seq/run_summarization.py @@ -46,7 +46,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.5.0.dev0") +check_min_version("4.6.0.dev0") logger = logging.getLogger(__name__) @@ -394,7 +394,7 @@ def main(): padding = "max_length" if data_args.pad_to_max_length else False if training_args.label_smoothing_factor > 0 and not hasattr(model, "prepare_decoder_input_ids_from_labels"): - logger.warn( + logger.warning( "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for" f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory" ) diff --git a/examples/seq2seq/run_translation.py b/examples/seq2seq/run_translation.py index 0755a53413e740..a41da4e0abbeab 100755 --- a/examples/seq2seq/run_translation.py +++ b/examples/seq2seq/run_translation.py @@ -34,6 +34,9 @@ AutoTokenizer, DataCollatorForSeq2Seq, HfArgumentParser, + M2M100Tokenizer, + MBart50Tokenizer, + MBart50TokenizerFast, MBartTokenizer, MBartTokenizerFast, Seq2SeqTrainer, @@ -46,10 +49,13 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.5.0.dev0") +check_min_version("4.6.0.dev0") logger = logging.getLogger(__name__) +# A list of all multilingual tokenizers which require src_lang and tgt_lang attributes. +MULTILINGUAL_TOKENIZERS = [MBartTokenizer, MBartTokenizerFast, MBart50Tokenizer, MBart50TokenizerFast, M2M100Tokenizer] + @dataclass class ModelArguments: @@ -191,6 +197,14 @@ class DataTrainingArguments: source_prefix: Optional[str] = field( default=None, metadata={"help": "A prefix to add before every source text (useful for T5 models)."} ) + forced_bos_token: Optional[str] = field( + default=None, + metadata={ + "help": "The token to force as the first generated token after the :obj:`decoder_start_token_id`."
+ " Useful for multilingual models like :doc:`mBART <../model_doc/mbart>` where the first generated token " + "needs to be the target language token." + }, + ) def __post_init__(self): if self.dataset_name is None and self.train_file is None and self.validation_file is None: @@ -325,9 +339,6 @@ def main(): # Set decoder_start_token_id if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)): - assert ( - data_args.target_lang is not None and data_args.source_lang is not None - ), "mBart requires --target_lang and --source_lang" if isinstance(tokenizer, MBartTokenizer): model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.target_lang] else: @@ -352,11 +363,21 @@ def main(): # For translation we set the codes of our source and target languages (only useful for mBART, the others will # ignore those attributes). - if isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)): - if data_args.source_lang is not None: - tokenizer.src_lang = data_args.source_lang - if data_args.target_lang is not None: - tokenizer.tgt_lang = data_args.target_lang + if isinstance(tokenizer, tuple(MULTILINGUAL_TOKENIZERS)): + assert data_args.target_lang is not None and data_args.source_lang is not None, ( + f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --source_lang and " + "--target_lang arguments." + ) + + tokenizer.src_lang = data_args.source_lang + tokenizer.tgt_lang = data_args.target_lang + + # For multilingual translation models like mBART-50 and M2M100 we need to force the target language token + # as the first generated token. We ask the user to explicitly provide this as the --forced_bos_token argument. + forced_bos_token_id = ( + tokenizer.lang_code_to_id[data_args.forced_bos_token] if data_args.forced_bos_token is not None else None + ) + model.config.forced_bos_token_id = forced_bos_token_id # Get the language codes for input/target. source_lang = data_args.source_lang.split("_")[0] @@ -367,7 +388,7 @@ def main(): padding = "max_length" if data_args.pad_to_max_length else False if training_args.label_smoothing_factor > 0 and not hasattr(model, "prepare_decoder_input_ids_from_labels"): - logger.warn( + logger.warning( "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for" f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory" ) diff --git a/examples/tests/deepspeed/test_deepspeed.py b/examples/tests/deepspeed/test_deepspeed.py deleted file mode 100644 index acaebc9f32a399..00000000000000 --- a/examples/tests/deepspeed/test_deepspeed.py +++ /dev/null @@ -1,427 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
- -import dataclasses -import io -import json -import os -import sys -import unittest -from copy import deepcopy - -from transformers import TrainingArguments -from transformers.file_utils import WEIGHTS_NAME -from transformers.integrations import is_deepspeed_available -from transformers.testing_utils import ( - CaptureStd, - TestCasePlus, - execute_subprocess_async, - get_gpu_count, - mockenv_context, - require_torch_gpu, - require_torch_multi_gpu, - slow, -) -from transformers.trainer_utils import set_seed - - -bindir = os.path.abspath(os.path.dirname(__file__)) -sys.path.append(f"{bindir}/../../../tests") -from test_trainer import TrainerIntegrationCommon, get_regression_trainer # noqa - - -set_seed(42) -MBART_TINY = "sshleifer/tiny-mbart" - - -def load_json(path): - with open(path) as f: - return json.load(f) - - -# a candidate for testing_utils -def require_deepspeed(test_case): - """ - Decorator marking a test that requires deepspeed - """ - if not is_deepspeed_available(): - return unittest.skip("test requires deepspeed")(test_case) - else: - return test_case - - -@require_deepspeed -@require_torch_gpu -class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon): - """ - - This class is for testing directly via get_regression_trainer - - It mixes in `TrainerIntegrationCommon` which already has a lot of helper validation methods which we can re-use here. - """ - - def setUp(self): - super().setUp() - - args = TrainingArguments(".") - self.n_epochs = args.num_train_epochs - self.batch_size = args.train_batch_size - - self.dist_env_1_gpu = dict( - MASTER_ADDR="localhost", MASTER_PORT="10999", RANK="0", LOCAL_RANK="0", WORLD_SIZE="1" - ) - self.ds_config_file = f"{self.test_file_dir_str}/ds_config.json" - with io.open(self.ds_config_file, "r", encoding="utf-8") as f: - self.ds_config_dict = json.load(f) - - def test_fake_notebook_no_launcher(self): - # this setup emulates a notebook where a launcher needs to be emulated by hand - with CaptureStd() as cs: # noqa - with mockenv_context(**self.dist_env_1_gpu): - trainer = get_regression_trainer(local_rank=0, deepspeed=self.ds_config_file) - trainer.train() - # fixme: - # assert "DeepSpeed info" in cs.out, "expected DeepSpeed logger output but got none" - - # Test various combos - # 1. DS scheduler + DS optimizer: this is already tested by most other tests - # 2. HF scheduler + HF optimizer: - # 3. DS scheduler + HF optimizer: - # 4. 
HF scheduler + DS optimizer: - - def test_hf_scheduler_hf_optimizer(self): - a = 0 - with mockenv_context(**self.dist_env_1_gpu): - ds_config_dict = deepcopy(self.ds_config_dict) - del ds_config_dict["optimizer"] # force default HF Trainer optimizer - del ds_config_dict["scheduler"] # force default HF Trainer scheduler - ds_config_dict["zero_optimization"]["cpu_offload"] = False - ds_config_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step - trainer = get_regression_trainer(a=a, local_rank=0, deepspeed=ds_config_dict) - trainer.train() - new_a = trainer.model.a.item() - self.assertNotEqual(new_a, a) - - def test_ds_scheduler_hf_optimizer(self): - a = 0 - with mockenv_context(**self.dist_env_1_gpu): - ds_config_dict = deepcopy(self.ds_config_dict) - del ds_config_dict["optimizer"] # force default HF Trainer optimizer - ds_config_dict["zero_optimization"]["cpu_offload"] = False - ds_config_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step - trainer = get_regression_trainer(a=a, local_rank=0, deepspeed=ds_config_dict) - trainer.train() - new_a = trainer.model.a.item() - self.assertNotEqual(new_a, a) - - def test_hf_scheduler_ds_optimizer(self): - # this combo is not possible at the moment - with mockenv_context(**self.dist_env_1_gpu): - ds_config_dict = deepcopy(self.ds_config_dict) - del ds_config_dict["scheduler"] # force default HF Trainer scheduler - ds_config_dict["zero_optimization"]["cpu_offload"] = False - ds_config_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step - trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_dict) - with self.assertRaises(Exception) as context: - trainer.train() - self.assertTrue("HF scheduler + DeepSpeed optimizer combination is not possible" in str(context.exception)) - - def test_hf_optimizer_with_offload(self): - # must not allow non-DS optimizer when using ZERO-offload - with mockenv_context(**self.dist_env_1_gpu): - ds_config_dict = deepcopy(self.ds_config_dict) - del ds_config_dict["optimizer"] # force default HF Trainer optimizer - ds_config_dict["zero_optimization"]["cpu_offload"] = True - # sanity check - should the default config change - assert ( - "cpu_offload" in ds_config_dict["zero_optimization"] - and ds_config_dict["zero_optimization"]["cpu_offload"] is True - ), "ensure the config is set up correctly" - trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_dict) - with self.assertRaises(Exception) as context: - trainer.train() - self.assertTrue("ZeRO Offload can only work with DeepSpeed optimizers" in str(context.exception)) - - def test_early_get_last_lr(self): - # with deepspeed's fp16 and dynamic loss scale enabled the optimizer/scheduler steps may - # not run for the first few dozen steps while loss scale is too large, and thus during - # that time `get_last_lr` will fail if called during that warm up stage, - # - # setting `logging_steps=1` forces an early `trainer._maybe_log_save_evaluate()` which calls - # `self.lr_scheduler.get_last_lr()` and originally it'd fail on the very first step. 
- with mockenv_context(**self.dist_env_1_gpu): - a = b = 0.0 - trainer = get_regression_trainer( - a=a, - b=b, - local_rank=0, - train_len=8, - deepspeed=self.ds_config_file, - per_device_train_batch_size=8, - logging_steps=1, - ) - trainer.train() - no_grad_accum_a = trainer.model.a.item() - - # it's enough that train didn't fail for this test, but we must check that - # optimizer/scheduler didn't run (since if it did this test isn't testing the right thing) - self.assertEqual(no_grad_accum_a, a) - - def test_gradient_accumulation(self): - - # this test measures that we get identical weights and similar loss with: - # 1. per_device_train_batch_size=8, gradient_accumulation_steps=1 - # 2. per_device_train_batch_size=4, gradient_accumulation_steps=2 - # since the 2nd should produce the effective batch of 1st, with the same results - # - # I can get an identical loss for a small train_len=32, plus the power of the initial - # dynamic loss scale value set to: - # "fp16.initial_scale_power": 1 - # plus having the same WarmupLR's warmup_min_lr == warmup_max_lr in the config file - # but for some reason going to train_len=64 the weights, weights start to mismatch with this setup. - # the culprit seems to be `initial_scale_power` - putting it back to its default 32 keeps the weights identical - - train_len = 64 - a = b = 0.0 - - with mockenv_context(**self.dist_env_1_gpu): - no_grad_accum_trainer = get_regression_trainer( - a=a, - b=b, - local_rank=0, - train_len=train_len, - deepspeed=self.ds_config_file, - per_device_train_batch_size=8, - gradient_accumulation_steps=1, - ) - no_grad_accum_result = no_grad_accum_trainer.train() - no_grad_accum_loss = no_grad_accum_result.training_loss - no_grad_accum_a = no_grad_accum_trainer.model.a.item() - no_grad_accum_b = no_grad_accum_trainer.model.b.item() - # make sure the optimizer kicked in - if it hasn't changed from the original value of a then make train_len bigger - self.assertNotEqual(no_grad_accum_a, a) - - with mockenv_context(**self.dist_env_1_gpu): - yes_grad_accum_trainer = get_regression_trainer( - a=a, - b=b, - local_rank=0, - train_len=train_len, - deepspeed=self.ds_config_file, - per_device_train_batch_size=4, - gradient_accumulation_steps=2, - ) - yes_grad_accum_result = yes_grad_accum_trainer.train() - yes_grad_accum_loss = yes_grad_accum_result.training_loss - yes_grad_accum_a = yes_grad_accum_trainer.model.a.item() - yes_grad_accum_b = yes_grad_accum_trainer.model.b.item() - self.assertNotEqual(yes_grad_accum_a, a) - - # training with half the batch size but accumulation steps as 2 should give the same weights - self.assertEqual(no_grad_accum_a, yes_grad_accum_a) - self.assertEqual(no_grad_accum_b, yes_grad_accum_b) - - # see the note above how to get identical loss on a small bs - self.assertAlmostEqual(no_grad_accum_loss, yes_grad_accum_loss, places=5) - - def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, is_pretrained=True): - # adapted from TrainerIntegrationCommon.check_saved_checkpoints - - file_list = [WEIGHTS_NAME, "training_args.bin", "trainer_state.json", "config.json"] - ds_file_list = ["mp_rank_00_model_states.pt", "zero_pp_rank_0_mp_rank_00optim_states.pt"] - - for step in range(freq, total, freq): - checkpoint = os.path.join(output_dir, f"checkpoint-{step}") - self.assertTrue(os.path.isdir(checkpoint)) - - # common files - for filename in file_list: - self.assertTrue(os.path.isfile(os.path.join(checkpoint, filename))) - - # ds files - ds_path = os.path.join(checkpoint, f"global_step{step}") - for 
filename in ds_file_list: - # filename = os.path.join(path, filename) - # print(filename) - self.assertTrue(os.path.isfile(os.path.join(ds_path, filename))) - - def test_save_checkpoints(self): - # adapted from TrainerIntegrationTest.test_save_checkpoints - - output_dir = self.get_auto_remove_tmp_dir() - ds_config_dict = deepcopy(self.ds_config_dict) - ds_config_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step - freq = 5 - - # save checkpoints - with mockenv_context(**self.dist_env_1_gpu): - trainer = get_regression_trainer( - output_dir=output_dir, - save_steps=freq, - deepspeed=ds_config_dict, - ) - trainer.train() - - total = int(self.n_epochs * 64 / self.batch_size) - self.check_saved_checkpoints_deepspeed(output_dir, freq, total) - - def test_can_resume_training(self): - # adapted from TrainerIntegrationTest.test_can_resume_training - - output_dir = self.get_auto_remove_tmp_dir() - ds_config_dict = deepcopy(self.ds_config_dict) - ds_config_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step - kwargs = dict(output_dir=output_dir, train_len=128, save_steps=5, learning_rate=0.1, deepspeed=ds_config_dict) - - with mockenv_context(**self.dist_env_1_gpu): - trainer = get_regression_trainer(**kwargs) - trainer.train() - (a, b) = trainer.model.a.item(), trainer.model.b.item() - state = dataclasses.asdict(trainer.state) - - checkpoint = os.path.join(output_dir, "checkpoint-5") - - # Reinitialize trainer - trainer = get_regression_trainer(**kwargs) - - trainer.train(resume_from_checkpoint=checkpoint) - (a1, b1) = trainer.model.a.item(), trainer.model.b.item() - state1 = dataclasses.asdict(trainer.state) - self.assertEqual(a, a1) - self.assertEqual(b, b1) - self.check_trainer_state_are_the_same(state, state1) - - # Now check with a later checkpoint that it also works when we span over one epoch - checkpoint = os.path.join(output_dir, "checkpoint-15") - - # Reinitialize trainer and load model - trainer = get_regression_trainer(**kwargs) - - trainer.train(resume_from_checkpoint=checkpoint) - (a1, b1) = trainer.model.a.item(), trainer.model.b.item() - state1 = dataclasses.asdict(trainer.state) - self.assertEqual(a, a1) - self.assertEqual(b, b1) - self.check_trainer_state_are_the_same(state, state1) - - # Now check failures - - # 1. fail to find a bogus checkpoint - trainer = get_regression_trainer(**kwargs) - with self.assertRaises(Exception) as context: - trainer.train(resume_from_checkpoint=f"{checkpoint}-bogus") - self.assertTrue("failed to resume from checkpoint" in str(context.exception)) - - # 2. 
fail to find any checkpoint - due a fresh output_dir - output_dir2 = self.get_auto_remove_tmp_dir() - trainer = get_regression_trainer(output_dir=output_dir2, deepspeed=ds_config_dict) - with self.assertRaises(Exception) as context: - trainer.train(resume_from_checkpoint=True) - self.assertTrue("No valid checkpoint found in output directory" in str(context.exception)) - - -@slow -@require_deepspeed -@require_torch_gpu -class TestDeepSpeed(TestCasePlus): - """ This class is for testing via an external script """ - - @require_torch_multi_gpu - def test_basic_distributed(self): - self.run_quick(distributed=True) - - def test_do_eval_no_train(self): - # we should not fail if train is skipped - output_dir = self.run_trainer( - eval_steps=1, - max_len=12, - model_name=MBART_TINY, - num_train_epochs=1, - distributed=False, - extra_args_str="--do_eval", - remove_args_str="--do_train", - ) - val_metrics = load_json(os.path.join(output_dir, "eval_results.json")) - assert "eval_bleu" in val_metrics - - # XXX: need to do better validation beyond just that the run was successful - def run_quick(self, distributed=True, extra_args_str=None, remove_args_str=None): - output_dir = self.run_trainer( - eval_steps=1, - max_len=12, - model_name=MBART_TINY, - num_train_epochs=1, - distributed=distributed, - extra_args_str=extra_args_str, - remove_args_str=remove_args_str, - ) - train_metrics = load_json(os.path.join(output_dir, "train_results.json")) - assert "train_runtime" in train_metrics - - def run_trainer( - self, - eval_steps: int, - max_len: str, - model_name: str, - num_train_epochs: int, - distributed: bool = True, - extra_args_str: str = None, - remove_args_str: str = None, - ): - data_dir = self.examples_dir / "test_data/wmt_en_ro" - output_dir = self.get_auto_remove_tmp_dir() - args = f""" - --model_name_or_path {model_name} - --train_file {data_dir}/train.json - --validation_file {data_dir}/val.json - --output_dir {output_dir} - --overwrite_output_dir - --max_train_samples 8 - --max_val_samples 8 - --max_source_length {max_len} - --max_target_length {max_len} - --val_max_target_length {max_len} - --do_train - --num_train_epochs {str(num_train_epochs)} - --per_device_train_batch_size 4 - --learning_rate 3e-3 - --warmup_steps 8 - --predict_with_generate - --logging_steps 0 - --save_steps {str(eval_steps)} - --group_by_length - --label_smoothing_factor 0.1 - --adafactor - --target_lang ro_RO - --source_lang en_XX - """.split() - - if extra_args_str is not None: - args.extend(extra_args_str.split()) - - if remove_args_str is not None: - remove_args = remove_args_str.split() - args = [x for x in args if x not in remove_args] - - ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config.json".split() - script = [f"{self.examples_dir_str}/seq2seq/run_translation.py"] - num_gpus = get_gpu_count() if distributed else 1 - launcher = f"deepspeed --num_gpus {num_gpus}".split() - - cmd = launcher + script + args + ds_args - # keep for quick debug - # print(" ".join([f"PYTHONPATH={self.src_dir_str}"] +cmd)); die - execute_subprocess_async(cmd, env=self.get_env()) - - return output_dir diff --git a/examples/text-classification/run_glue.py b/examples/text-classification/run_glue.py index 82762b6ac8f324..94b52a4bd0ba54 100755 --- a/examples/text-classification/run_glue.py +++ b/examples/text-classification/run_glue.py @@ -45,7 +45,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.5.0.dev0") +check_min_version("4.6.0.dev0") task_to_keys = { "cola": ("sentence", None), @@ -351,7 +351,7 @@ def main(): if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)} else: - logger.warn( + logger.warning( "Your model seems to have been trained with labels, but they don't match the dataset: ", f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}." "\nIgnoring the model labels as a result.", @@ -360,7 +360,7 @@ def main(): label_to_id = {v: i for i, v in enumerate(label_list)} if data_args.max_seq_length > tokenizer.model_max_length: - logger.warn( + logger.warning( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) diff --git a/examples/text-classification/run_glue_no_trainer.py b/examples/text-classification/run_glue_no_trainer.py index f02fc0757ceb2c..646d6e93f63d83 100644 --- a/examples/text-classification/run_glue_no_trainer.py +++ b/examples/text-classification/run_glue_no_trainer.py @@ -274,7 +274,7 @@ def main(): ) label_to_id = {i: label_name_to_id[label_list[i]] for i in range(num_labels)} else: - logger.warn( + logger.warning( "Your model seems to have been trained with labels, but they don't match the dataset: ", f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}." "\nIgnoring the model labels as a result.", diff --git a/examples/text-classification/run_xnli.py b/examples/text-classification/run_xnli.py index 2b95e0ca950cea..82a6b0f2a32c42 100755 --- a/examples/text-classification/run_xnli.py +++ b/examples/text-classification/run_xnli.py @@ -45,7 +45,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.5.0.dev0") +check_min_version("4.6.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/token-classification/run_ner.py b/examples/token-classification/run_ner.py index 053a193a60d94d..0fc08644b801d2 100755 --- a/examples/token-classification/run_ner.py +++ b/examples/token-classification/run_ner.py @@ -45,7 +45,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.5.0.dev0") +check_min_version("4.6.0.dev0") logger = logging.getLogger(__name__) diff --git a/setup.py b/setup.py index 60c69ffa062f3b..c403f1f33af1b5 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ 1. Run `make pre-release` (or `make pre-patch` for a patch release) then run `make fix-copies` to fix the index of the documentation. - + 2. Run Tests for Amazon Sagemaker. The documentation is located in `./tests/sagemaker/README.md`, otherwise @philschmid. 3. Unpin specific versions from setup.py that use a git install. @@ -85,11 +85,14 @@ # 1. all dependencies should be listed here with their version requirements if any # 2. 
once modified, run: `make deps_table_update` to update src/transformers/dependency_versions_table.py _deps = [ + "Pillow", "black>=20.8b1", "cookiecutter==1.7.2", "dataclasses", "datasets", + "deepspeed>0.3.13", "docutils==0.16.0", + "fairscale>0.3", "faiss-cpu", "fastapi", "filelock", @@ -101,14 +104,15 @@ "isort>=5.5.4", "jax>=0.2.8", "jaxlib>=0.1.59", + "jieba", "keras2onnx", + "nltk", "numpy>=1.17", "onnxconverter-common", "onnxruntime-tools>=1.4.2", "onnxruntime>=1.4.0", "packaging", "parameterized", - "Pillow", "protobuf", "psutil", "pydantic", @@ -119,7 +123,10 @@ "recommonmark", "regex!=2019.12.17", "requests", + "rouge-score", + "sacrebleu>=1.4.12", "sacremoses", + "sagemaker>=2.31.0", "scikit-learn", "sentencepiece==0.1.91", "soundfile", @@ -127,6 +134,7 @@ "sphinx-markdown-tables", "sphinx-rtd-theme==0.4.3", # sphinx-rtd-theme==0.5.0 introduced big changes in the style. "sphinx==3.2.1", + "sphinxext-opengraph==0.4.1", "starlette", "tensorflow-cpu>=2.3", "tensorflow>=2.3", @@ -138,7 +146,6 @@ "unidic>=1.0.2", "unidic_lite>=1.0.7", "uvicorn", - "sagemaker>=2.31.0", ] @@ -229,6 +236,8 @@ def run(self): extras["modelcreation"] = deps_list("cookiecutter") extras["sagemaker"] = deps_list("sagemaker") +extras["deepspeed"] = deps_list("deepspeed") +extras["fairscale"] = deps_list("fairscale") extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette") extras["speech"] = deps_list("soundfile", "torchaudio") @@ -237,24 +246,42 @@ def run(self): extras["sentencepiece"] = deps_list("sentencepiece", "protobuf") extras["testing"] = ( deps_list( - "pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets", "pytest-sugar", "black" + "pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets", "pytest-sugar", "black", "sacrebleu", "rouge-score", "nltk" ) + extras["retrieval"] + extras["modelcreation"] ) -extras["docs"] = deps_list( - "docutils", "recommonmark", "sphinx", "sphinx-markdown-tables", "sphinx-rtd-theme", "sphinx-copybutton" -) + extras["quality"] = deps_list("black", "isort", "flake8") -extras["all"] = extras["tf"] + extras["torch"] + extras["flax"] + extras["sentencepiece"] + extras["tokenizers"] + extras["speech"] + extras["vision"] +extras["all"] = ( + extras["tf"] + + extras["torch"] + + extras["flax"] + + extras["sentencepiece"] + + extras["tokenizers"] + + extras["speech"] + + extras["vision"] +) + +extras["docs_specific"] = deps_list( + "docutils", + "recommonmark", + "sphinx", + "sphinx-markdown-tables", + "sphinx-rtd-theme", + "sphinx-copybutton", + "sphinxext-opengraph", +) +# "docs" needs "all" to resolve all the references +extras["docs"] = extras["all"] + extras["docs_specific"] extras["dev"] = ( extras["all"] + extras["testing"] + extras["quality"] + extras["ja"] - + extras["docs"] + + extras["docs_specific"] + extras["sklearn"] + extras["modelcreation"] ) @@ -290,7 +317,7 @@ def run(self): setup( name="transformers", - version="4.5.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="4.6.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Sam Shleifer, Patrick von Platen, Sylvain Gugger, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors", author_email="thomas@huggingface.co", description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch", diff 
--git a/src/transformers/__init__.py b/src/transformers/__init__.py index 0b9d366d3cfbcf..f71e075eaaed40 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -22,7 +22,7 @@ # to defer the actual importing for when the objects are requested. This way `import transformers` provides the names # in the namespace without actually importing anything (and especially none of the backends). -__version__ = "4.5.0.dev0" +__version__ = "4.6.0.dev0" # Work around to update TensorFlow's absl.logging threshold which alters the # default Python logging output behavior when present. @@ -45,6 +45,7 @@ _BaseLazyModule, is_flax_available, is_sentencepiece_available, + is_speech_available, is_tf_available, is_tokenizers_available, is_torch_available, @@ -102,6 +103,7 @@ "is_py3nvml_available", "is_sentencepiece_available", "is_sklearn_available", + "is_speech_available", "is_tf_available", "is_tokenizers_available", "is_torch_available", @@ -133,9 +135,11 @@ "models.auto": [ "ALL_PRETRAINED_CONFIG_ARCHIVE_MAP", "CONFIG_MAPPING", + "FEATURE_EXTRACTOR_MAPPING", "MODEL_NAMES_MAPPING", "TOKENIZER_MAPPING", "AutoConfig", + "AutoFeatureExtractor", "AutoTokenizer", ], "models.bart": ["BartConfig", "BartTokenizer"], @@ -159,6 +163,7 @@ ], "models.camembert": ["CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "CamembertConfig"], "models.convbert": ["CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvBertConfig", "ConvBertTokenizer"], + "models.cpm": ["CpmTokenizer"], "models.ctrl": ["CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP", "CTRLConfig", "CTRLTokenizer"], "models.deberta": ["DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaConfig", "DebertaTokenizer"], "models.deberta_v2": ["DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaV2Config"], @@ -187,6 +192,7 @@ "models.m2m_100": ["M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP", "M2M100Config"], "models.marian": ["MarianConfig"], "models.mbart": ["MBartConfig"], + "models.megatron_bert": ["MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "MegatronBertConfig"], "models.mmbt": ["MMBTConfig"], "models.mobilebert": ["MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "MobileBertConfig", "MobileBertTokenizer"], "models.mpnet": ["MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "MPNetConfig", "MPNetTokenizer"], @@ -202,7 +208,6 @@ "models.speech_to_text": [ "SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "Speech2TextConfig", - "Speech2TextFeatureExtractor", ], "models.squeezebert": ["SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "SqueezeBertConfig", "SqueezeBertTokenizer"], "models.t5": ["T5_PRETRAINED_CONFIG_ARCHIVE_MAP", "T5Config"], @@ -288,7 +293,6 @@ _import_structure["models.pegasus"].append("PegasusTokenizer") _import_structure["models.reformer"].append("ReformerTokenizer") _import_structure["models.speech_to_text"].append("Speech2TextTokenizer") - _import_structure["models.speech_to_text"].append("Speech2TextProcessor") _import_structure["models.t5"].append("T5Tokenizer") _import_structure["models.xlm_prophetnet"].append("XLMProphetNetTokenizer") _import_structure["models.xlm_roberta"].append("XLMRobertaTokenizer") @@ -337,8 +341,6 @@ _import_structure["models.xlnet"].append("XLNetTokenizerFast") _import_structure["tokenization_utils_fast"] = ["PreTrainedTokenizerFast"] - if is_sentencepiece_available(): - _import_structure["convert_slow_tokenizer"] = ["SLOW_TO_FAST_CONVERTERS", "convert_slow_tokenizer"] else: from .utils import dummy_tokenizers_objects @@ -346,6 +348,35 @@ name for name in dir(dummy_tokenizers_objects) if not name.startswith("_") ] +if is_sentencepiece_available() and 
is_tokenizers_available(): + _import_structure["convert_slow_tokenizer"] = ["SLOW_TO_FAST_CONVERTERS", "convert_slow_tokenizer"] +else: + from .utils import dummy_sentencepiece_and_tokenizers_objects + + _import_structure["utils.dummy_sentencepiece_and_tokenizers_objects"] = [ + name for name in dir(dummy_sentencepiece_and_tokenizers_objects) if not name.startswith("_") + ] + +# Speech-specific objects +if is_speech_available(): + _import_structure["models.speech_to_text"].append("Speech2TextFeatureExtractor") + +else: + from .utils import dummy_speech_objects + + _import_structure["utils.dummy_speech_objects"] = [ + name for name in dir(dummy_speech_objects) if not name.startswith("_") + ] + +if is_sentencepiece_available() and is_speech_available(): + _import_structure["models.speech_to_text"].append("Speech2TextProcessor") +else: + from .utils import dummy_sentencepiece_and_speech_objects + + _import_structure["utils.dummy_sentencepiece_and_speech_objects"] = [ + name for name in dir(dummy_sentencepiece_and_speech_objects) if not name.startswith("_") + ] + # Vision-specific objects if is_vision_available(): _import_structure["image_utils"] = ["ImageFeatureExtractionMixin"] @@ -736,6 +767,20 @@ "MBartModel", ] ) + _import_structure["models.megatron_bert"].extend( + [ + "MEGATRON_BERT_PRETRAINED_MODEL_ARCHIVE_LIST", + "MegatronBertForCausalLM", + "MegatronBertForMaskedLM", + "MegatronBertForMultipleChoice", + "MegatronBertForNextSentencePrediction", + "MegatronBertForPreTraining", + "MegatronBertForQuestionAnswering", + "MegatronBertForSequenceClassification", + "MegatronBertForTokenClassification", + "MegatronBertModel", + ] + ) _import_structure["models.mmbt"].extend(["MMBTForClassification", "MMBTModel", "ModalEmbeddings"]) _import_structure["models.mobilebert"].extend( [ @@ -1394,6 +1439,7 @@ is_py3nvml_available, is_sentencepiece_available, is_sklearn_available, + is_speech_available, is_tf_available, is_tokenizers_available, is_torch_available, @@ -1429,9 +1475,11 @@ from .models.auto import ( ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, CONFIG_MAPPING, + FEATURE_EXTRACTOR_MAPPING, MODEL_NAMES_MAPPING, TOKENIZER_MAPPING, AutoConfig, + AutoFeatureExtractor, AutoTokenizer, ) from .models.bart import BartConfig, BartTokenizer @@ -1454,6 +1502,7 @@ ) from .models.camembert import CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CamembertConfig from .models.convbert import CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvBertConfig, ConvBertTokenizer + from .models.cpm import CpmTokenizer from .models.ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig, CTRLTokenizer from .models.deberta import DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaConfig, DebertaTokenizer from .models.deberta_v2 import DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaV2Config @@ -1482,6 +1531,7 @@ from .models.m2m_100 import M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP, M2M100Config from .models.marian import MarianConfig from .models.mbart import MBartConfig + from .models.megatron_bert import MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, MegatronBertConfig from .models.mmbt import MMBTConfig from .models.mobilebert import MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, MobileBertConfig, MobileBertTokenizer from .models.mpnet import MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP, MPNetConfig, MPNetTokenizer @@ -1494,11 +1544,7 @@ from .models.reformer import REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, ReformerConfig from .models.retribert import RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RetriBertConfig, RetriBertTokenizer from .models.roberta import 
ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig, RobertaTokenizer - from .models.speech_to_text import ( - SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, - Speech2TextConfig, - Speech2TextFeatureExtractor, - ) + from .models.speech_to_text import SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, Speech2TextConfig from .models.squeezebert import SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, SqueezeBertConfig, SqueezeBertTokenizer from .models.t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config from .models.tapas import TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP, TapasConfig, TapasTokenizer @@ -1585,7 +1631,7 @@ from .models.mt5 import MT5Tokenizer from .models.pegasus import PegasusTokenizer from .models.reformer import ReformerTokenizer - from .models.speech_to_text import Speech2TextProcessor, Speech2TextTokenizer + from .models.speech_to_text import Speech2TextTokenizer from .models.t5 import T5Tokenizer from .models.xlm_prophetnet import XLMProphetNetTokenizer from .models.xlm_roberta import XLMRobertaTokenizer @@ -1625,11 +1671,25 @@ from .models.xlnet import XLNetTokenizerFast from .tokenization_utils_fast import PreTrainedTokenizerFast - if is_sentencepiece_available(): - from .convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS, convert_slow_tokenizer else: from .utils.dummy_tokenizers_objects import * + if is_sentencepiece_available() and is_tokenizers_available(): + from .convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS, convert_slow_tokenizer + else: + from .utils.dummy_sentencepiece_and_tokenizers_objects import * + + if is_speech_available(): + from .models.speech_to_text import Speech2TextFeatureExtractor + + else: + from .utils.dummy_speech_objects import * + + if is_speech_available() and is_sentencepiece_available(): + from .models.speech_to_text import Speech2TextProcessor + else: + from .utils.dummy_sentencepiece_and_speech_objects import * + if is_vision_available(): from .image_utils import ImageFeatureExtractionMixin from .models.vit import ViTFeatureExtractor @@ -1957,6 +2017,18 @@ MBartForSequenceClassification, MBartModel, ) + from .models.megatron_bert import ( + MEGATRON_BERT_PRETRAINED_MODEL_ARCHIVE_LIST, + MegatronBertForCausalLM, + MegatronBertForMaskedLM, + MegatronBertForMultipleChoice, + MegatronBertForNextSentencePrediction, + MegatronBertForPreTraining, + MegatronBertForQuestionAnswering, + MegatronBertForSequenceClassification, + MegatronBertForTokenClassification, + MegatronBertModel, + ) from .models.mmbt import MMBTForClassification, MMBTModel, ModalEmbeddings from .models.mobilebert import ( MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index 9aa2440ce9dfe0..ad517ba1549639 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -262,7 +262,7 @@ def __init__(self, **kwargs): # TPU arguments if kwargs.pop("xla_device", None) is not None: - logger.warn( + logger.warning( "The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can " "safely remove it from your `config.json` file."
) diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index e98c635d04dccc..680f910d37a5fb 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -24,7 +24,7 @@ from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors from tokenizers.models import BPE, Unigram, WordPiece -from .file_utils import requires_protobuf, requires_sentencepiece +from .file_utils import requires_backends class SentencePieceExtractor: @@ -33,7 +33,7 @@ class SentencePieceExtractor: """ def __init__(self, model: str): - requires_sentencepiece(self) + requires_backends(self, "sentencepiece") from sentencepiece import SentencePieceProcessor self.sp = SentencePieceProcessor() @@ -298,7 +298,7 @@ def converted(self) -> Tokenizer: class SpmConverter(Converter): def __init__(self, *args): - requires_protobuf(self) + requires_backends(self, "protobuf") super().__init__(*args) diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py index 94eaade7b158d9..9915eb5a5f3c81 100644 --- a/src/transformers/data/data_collator.py +++ b/src/transformers/data/data_collator.py @@ -192,7 +192,7 @@ def __call__(self, features): return batch -def _collate_batch(examples, tokenizer): +def _collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None): """Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary.""" # Tensorize if necessary. if isinstance(examples[0], (list, tuple)): @@ -201,7 +201,7 @@ def _collate_batch(examples, tokenizer): # Check if padding is necessary. length_of_first = examples[0].size(0) are_tensors_same_length = all(x.size(0) == length_of_first for x in examples) - if are_tensors_same_length: + if are_tensors_same_length and (pad_to_multiple_of is None or length_of_first % pad_to_multiple_of == 0): return torch.stack(examples, dim=0) # If yes, check if we have a `pad_token`. @@ -213,6 +213,8 @@ def _collate_batch(examples, tokenizer): # Creating the full tensor and filling it with our data. max_length = max(x.size(0) for x in examples) + if pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of result = examples[0].new_full([len(examples), max_length], tokenizer.pad_token_id) for i, example in enumerate(examples): if tokenizer.padding_side == "right": @@ -311,6 +313,8 @@ class DataCollatorForLanguageModeling: non-masked tokens and the value to predict for the masked token. mlm_probability (:obj:`float`, `optional`, defaults to 0.15): The probability with which to (randomly) mask tokens in the input, when :obj:`mlm` is set to :obj:`True`. + pad_to_multiple_of (:obj:`int`, `optional`): + If set will pad the sequence to a multiple of the provided value. .. note:: @@ -323,6 +327,7 @@ class DataCollatorForLanguageModeling: tokenizer: PreTrainedTokenizerBase mlm: bool = True mlm_probability: float = 0.15 + pad_to_multiple_of: Optional[int] = None def __post_init__(self): if self.mlm and self.tokenizer.mask_token is None: @@ -336,9 +341,9 @@ def __call__( ) -> Dict[str, torch.Tensor]: # Handle dict or lists with proper padding and conversion to tensor. 
if isinstance(examples[0], (dict, BatchEncoding)): - batch = self.tokenizer.pad(examples, return_tensors="pt") + batch = self.tokenizer.pad(examples, return_tensors="pt", pad_to_multiple_of=self.pad_to_multiple_of) else: - batch = {"input_ids": _collate_batch(examples, self.tokenizer)} + batch = {"input_ids": _collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)} # If special token mask has been preprocessed, pop it from the dict. special_tokens_mask = batch.pop("special_tokens_mask", None) diff --git a/src/transformers/data/datasets/squad.py b/src/transformers/data/datasets/squad.py index 00f433e4a32b99..9665fb25c23ae1 100644 --- a/src/transformers/data/datasets/squad.py +++ b/src/transformers/data/datasets/squad.py @@ -152,7 +152,7 @@ def __init__( ) if self.dataset is None or self.examples is None: - logger.warn( + logger.warning( f"Deleting cached file {cached_features_file} will allow dataset and examples to be cached in future run" ) else: diff --git a/src/transformers/data/metrics/__init__.py b/src/transformers/data/metrics/__init__.py index df4aa38ff34a2e..cd4bfdbddd1120 100644 --- a/src/transformers/data/metrics/__init__.py +++ b/src/transformers/data/metrics/__init__.py @@ -16,7 +16,7 @@ import warnings -from ...file_utils import is_sklearn_available, requires_sklearn +from ...file_utils import is_sklearn_available, requires_backends if is_sklearn_available(): @@ -34,13 +34,13 @@ def simple_accuracy(preds, labels): warnings.warn(DEPRECATION_WARNING, FutureWarning) - requires_sklearn(simple_accuracy) + requires_backends(simple_accuracy, "sklearn") return (preds == labels).mean() def acc_and_f1(preds, labels): warnings.warn(DEPRECATION_WARNING, FutureWarning) - requires_sklearn(acc_and_f1) + requires_backends(acc_and_f1, "sklearn") acc = simple_accuracy(preds, labels) f1 = f1_score(y_true=labels, y_pred=preds) return { @@ -52,7 +52,7 @@ def acc_and_f1(preds, labels): def pearson_and_spearman(preds, labels): warnings.warn(DEPRECATION_WARNING, FutureWarning) - requires_sklearn(pearson_and_spearman) + requires_backends(pearson_and_spearman, "sklearn") pearson_corr = pearsonr(preds, labels)[0] spearman_corr = spearmanr(preds, labels)[0] return { @@ -64,7 +64,7 @@ def pearson_and_spearman(preds, labels): def glue_compute_metrics(task_name, preds, labels): warnings.warn(DEPRECATION_WARNING, FutureWarning) - requires_sklearn(glue_compute_metrics) + requires_backends(glue_compute_metrics, "sklearn") assert len(preds) == len(labels), f"Predictions and labels have mismatched lengths {len(preds)} and {len(labels)}" if task_name == "cola": return {"mcc": matthews_corrcoef(labels, preds)} @@ -94,7 +94,7 @@ def glue_compute_metrics(task_name, preds, labels): def xnli_compute_metrics(task_name, preds, labels): warnings.warn(DEPRECATION_WARNING, FutureWarning) - requires_sklearn(xnli_compute_metrics) + requires_backends(xnli_compute_metrics, "sklearn") assert len(preds) == len(labels), f"Predictions and labels have mismatched lengths {len(preds)} and {len(labels)}" if task_name == "xnli": return {"acc": simple_accuracy(preds, labels)} diff --git a/src/transformers/dependency_versions_check.py b/src/transformers/dependency_versions_check.py index 7e36aaef3091ba..e6e676481d79c9 100644 --- a/src/transformers/dependency_versions_check.py +++ b/src/transformers/dependency_versions_check.py @@ -14,7 +14,7 @@ import sys from .dependency_versions_table import deps -from .utils.versions import require_version_core +from .utils.versions import require_version, 
require_version_core # define which module versions we always want to check at run time @@ -41,3 +41,7 @@ require_version_core(deps[pkg]) else: raise ValueError(f"can't find {pkg} in {deps.keys()}, check dependency_versions_table.py") + + +def dep_version_check(pkg, hint=None): + require_version(deps[pkg], hint) diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index c7a4bd41d644a1..82968ff299491a 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -2,11 +2,14 @@ # 1. modify the `_deps` dict in setup.py # 2. run `make deps_table_update`` deps = { + "Pillow": "Pillow", "black": "black>=20.8b1", "cookiecutter": "cookiecutter==1.7.2", "dataclasses": "dataclasses", "datasets": "datasets", + "deepspeed": "deepspeed>0.3.13", "docutils": "docutils==0.16.0", + "fairscale": "fairscale>0.3", "faiss-cpu": "faiss-cpu", "fastapi": "fastapi", "filelock": "filelock", @@ -18,14 +21,15 @@ "isort": "isort>=5.5.4", "jax": "jax>=0.2.8", "jaxlib": "jaxlib>=0.1.59", + "jieba": "jieba", "keras2onnx": "keras2onnx", + "nltk": "nltk", "numpy": "numpy>=1.17", "onnxconverter-common": "onnxconverter-common", "onnxruntime-tools": "onnxruntime-tools>=1.4.2", "onnxruntime": "onnxruntime>=1.4.0", "packaging": "packaging", "parameterized": "parameterized", - "Pillow": "Pillow", "protobuf": "protobuf", "psutil": "psutil", "pydantic": "pydantic", @@ -36,7 +40,10 @@ "recommonmark": "recommonmark", "regex": "regex!=2019.12.17", "requests": "requests", + "rouge-score": "rouge-score", + "sacrebleu": "sacrebleu>=1.4.12", "sacremoses": "sacremoses", + "sagemaker": "sagemaker>=2.31.0", "scikit-learn": "scikit-learn", "sentencepiece": "sentencepiece==0.1.91", "soundfile": "soundfile", @@ -44,6 +51,7 @@ "sphinx-markdown-tables": "sphinx-markdown-tables", "sphinx-rtd-theme": "sphinx-rtd-theme==0.4.3", "sphinx": "sphinx==3.2.1", + "sphinxext-opengraph": "sphinxext-opengraph==0.4.1", "starlette": "starlette", "tensorflow-cpu": "tensorflow-cpu>=2.3", "tensorflow": "tensorflow>=2.3", @@ -55,5 +63,4 @@ "unidic": "unidic>=1.0.2", "unidic_lite": "unidic_lite>=1.0.7", "uvicorn": "uvicorn", - "sagemaker": "sagemaker>=2.31.0", } diff --git a/src/transformers/feature_extraction_utils.py b/src/transformers/feature_extraction_utils.py index dbd5f9a6ccd36b..f7bf49c4009dbe 100644 --- a/src/transformers/feature_extraction_utils.py +++ b/src/transformers/feature_extraction_utils.py @@ -325,6 +325,13 @@ def get_feature_extractor_dict( local_files_only = kwargs.pop("local_files_only", False) revision = kwargs.pop("revision", None) + from_pipeline = kwargs.pop("_from_pipeline", None) + from_auto_class = kwargs.pop("_from_auto", False) + + user_agent = {"file_type": "feature extractor", "from_auto_class": from_auto_class} + if from_pipeline is not None: + user_agent["using_pipeline"] = from_pipeline + if is_offline_mode() and not local_files_only: logger.info("Offline mode: forcing local_files_only=True") local_files_only = True @@ -349,6 +356,7 @@ def get_feature_extractor_dict( resume_download=resume_download, local_files_only=local_files_only, use_auth_token=use_auth_token, + user_agent=user_agent, ) # Load feature_extractor dict with open(resolved_feature_extractor_file, "r", encoding="utf-8") as reader: @@ -426,6 +434,7 @@ def to_dict(self) -> Dict[str, Any]: :obj:`Dict[str, Any]`: Dictionary of all the attributes that make up this feature extractor instance. 
""" output = copy.deepcopy(self.__dict__) + output["feature_extractor_type"] = self.__class__.__name__ return output diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py index ed4b84dc108da8..cd61dc897e352c 100644 --- a/src/transformers/file_utils.py +++ b/src/transformers/file_utils.py @@ -194,7 +194,7 @@ and "PYTORCH_TRANSFORMERS_CACHE" not in os.environ and "TRANSFORMERS_CACHE" not in os.environ ): - logger.warn( + logger.warning( "In Transformers v4.0.0, the default path to cache downloaded models changed from " "'~/.cache/torch/transformers' to '~/.cache/huggingface/transformers'. Since you don't seem to have overridden " "and '~/.cache/torch/transformers' is a directory that exists, we're moving it to " @@ -397,6 +397,11 @@ def is_torchaudio_available(): return _torchaudio_available +def is_speech_available(): + # For now this depends on torchaudio but the exact dependency might evolve in the future. + return _torchaudio_available + + def torch_only_method(fn): def wrapper(*args, **kwargs): if not _torch_available: @@ -513,6 +518,13 @@ def wrapper(*args, **kwargs): """ +# docstyle-ignore +SPEECH_IMPORT_ERROR = """ +{0} requires the torchaudio library but it was not found in your environment. You can install it with pip: +`pip install torchaudio` +""" + + # docstyle-ignore VISION_IMPORT_ERROR = """ {0} requires the PIL library but it was not found in your environment. You can install it with pip: @@ -520,76 +532,32 @@ def wrapper(*args, **kwargs): """ -def requires_datasets(obj): - name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__ - if not is_datasets_available(): - raise ImportError(DATASETS_IMPORT_ERROR.format(name)) - - -def requires_faiss(obj): - name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__ - if not is_faiss_available(): - raise ImportError(FAISS_IMPORT_ERROR.format(name)) - - -def requires_pytorch(obj): - name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__ - if not is_torch_available(): - raise ImportError(PYTORCH_IMPORT_ERROR.format(name)) - - -def requires_sklearn(obj): - name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__ - if not is_sklearn_available(): - raise ImportError(SKLEARN_IMPORT_ERROR.format(name)) - - -def requires_tf(obj): - name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__ - if not is_tf_available(): - raise ImportError(TENSORFLOW_IMPORT_ERROR.format(name)) - - -def requires_flax(obj): - name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__ - if not is_flax_available(): - raise ImportError(FLAX_IMPORT_ERROR.format(name)) - - -def requires_tokenizers(obj): - name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__ - if not is_tokenizers_available(): - raise ImportError(TOKENIZERS_IMPORT_ERROR.format(name)) - - -def requires_sentencepiece(obj): - name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__ - if not is_sentencepiece_available(): - raise ImportError(SENTENCEPIECE_IMPORT_ERROR.format(name)) - - -def requires_protobuf(obj): - name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__ - if not is_protobuf_available(): - raise ImportError(PROTOBUF_IMPORT_ERROR.format(name)) - - -def requires_pandas(obj): - name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__ - if not is_pandas_available(): - raise ImportError(PANDAS_IMPORT_ERROR.format(name)) - +BACKENDS_MAPPING = OrderedDict( + [ + ("datasets", 
(is_datasets_available, DATASETS_IMPORT_ERROR)), + ("faiss", (is_faiss_available, FAISS_IMPORT_ERROR)), + ("flax", (is_flax_available, FLAX_IMPORT_ERROR)), + ("pandas", (is_pandas_available, PANDAS_IMPORT_ERROR)), + ("protobuf", (is_protobuf_available, PROTOBUF_IMPORT_ERROR)), + ("scatter", (is_scatter_available, SCATTER_IMPORT_ERROR)), + ("sentencepiece", (is_sentencepiece_available, SENTENCEPIECE_IMPORT_ERROR)), + ("sklearn", (is_sklearn_available, SKLEARN_IMPORT_ERROR)), + ("speech", (is_speech_available, SPEECH_IMPORT_ERROR)), + ("tf", (is_tf_available, TENSORFLOW_IMPORT_ERROR)), + ("tokenizers", (is_tokenizers_available, TOKENIZERS_IMPORT_ERROR)), + ("torch", (is_torch_available, PYTORCH_IMPORT_ERROR)), + ("vision", (is_vision_available, VISION_IMPORT_ERROR)), + ] +) -def requires_scatter(obj): - name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__ - if not is_scatter_available(): - raise ImportError(SCATTER_IMPORT_ERROR.format(name)) +def requires_backends(obj, backends): + if not isinstance(backends, (list, tuple)): + backends = [backends] -def requires_vision(obj): name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__ - if not is_vision_available(): - raise ImportError(VISION_IMPORT_ERROR.format(name)) + if not all(BACKENDS_MAPPING[backend][0]() for backend in backends): + raise ImportError("".join([BACKENDS_MAPPING[backend][1].format(name) for backend in backends])) def add_start_docstrings(*docstr): diff --git a/src/transformers/generation_logits_process.py b/src/transformers/generation_logits_process.py index c808d3ae4f6060..1d790b287823ef 100644 --- a/src/transformers/generation_logits_process.py +++ b/src/transformers/generation_logits_process.py @@ -39,8 +39,8 @@ `What are input IDs? <../glossary.html#input-ids>`__ scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.vocab_size)`): - Prediction scores of a language modeling head. These can be scores for each vocabulary token before SoftMax - or scores for each vocabulary token after SoftMax. + Prediction scores of a language modeling head. These can be logits for each vocabulary token when not using beam + search or log softmax for each vocabulary token when using beam search. kwargs: Additional logits processor specific kwargs. @@ -77,7 +77,7 @@ class LogitsProcessorList(list): This class can be used to create a list of :class:`~transformers.LogitsProcessor` or :class:`~transformers.LogitsWarper` to subsequently process a :obj:`scores` input tensor. This class inherits from list and adds a specific `__call__` method to apply each :class:`~transformers.LogitsProcessor` or - :class:`~transformers.LogitsProcessor` to the inputs. + :class:`~transformers.LogitsWarper` to the inputs.
""" @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING) diff --git a/src/transformers/generation_utils.py b/src/transformers/generation_utils.py index 086ad26992fefd..804d989b5412a9 100644 --- a/src/transformers/generation_utils.py +++ b/src/transformers/generation_utils.py @@ -18,6 +18,7 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union import torch +import torch.distributed as dist from torch.nn import functional as F from .file_utils import ModelOutput @@ -695,6 +696,7 @@ def generate( forced_bos_token_id: Optional[int] = None, forced_eos_token_id: Optional[int] = None, remove_invalid_values: Optional[bool] = None, + synced_gpus: Optional[bool] = None, **model_kwargs, ) -> Union[GreedySearchOutput, SampleOutput, BeamSearchOutput, BeamSampleOutput, torch.LongTensor]: r""" @@ -800,6 +802,8 @@ def generate( remove_invalid_values (:obj:`bool`, `optional`): Whether to remove possible `nan` and `inf` outputs of the model to prevent the generation method to crash. Note that using ``remove_invalid_values`` can slow down generation. + synced_gpus (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) model_kwargs: Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model. If the @@ -1000,6 +1004,7 @@ def generate( eos_token_id=eos_token_id, output_scores=output_scores, return_dict_in_generate=return_dict_in_generate, + synced_gpus=synced_gpus, **model_kwargs, ) @@ -1028,6 +1033,7 @@ def generate( eos_token_id=eos_token_id, output_scores=output_scores, return_dict_in_generate=return_dict_in_generate, + synced_gpus=synced_gpus, **model_kwargs, ) @@ -1063,6 +1069,7 @@ def generate( eos_token_id=eos_token_id, output_scores=output_scores, return_dict_in_generate=return_dict_in_generate, + synced_gpus=synced_gpus, **model_kwargs, ) @@ -1102,6 +1109,7 @@ def generate( eos_token_id=eos_token_id, output_scores=output_scores, return_dict_in_generate=return_dict_in_generate, + synced_gpus=synced_gpus, **model_kwargs, ) @@ -1141,6 +1149,7 @@ def generate( eos_token_id=eos_token_id, output_scores=output_scores, return_dict_in_generate=return_dict_in_generate, + synced_gpus=synced_gpus, **model_kwargs, ) @@ -1156,13 +1165,12 @@ def greedy_search( output_hidden_states: Optional[bool] = None, output_scores: Optional[bool] = None, return_dict_in_generate: Optional[bool] = None, + synced_gpus: Optional[bool] = None, **model_kwargs, ) -> Union[GreedySearchOutput, torch.LongTensor]: r""" Generates sequences for models with a language modeling head using greedy decoding. - - Parameters: input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): @@ -1175,6 +1183,7 @@ def greedy_search( stopping_criteria (:obj:`StoppingCriteriaList`, `optional`): An instance of :class:`~transformers.StoppingCriteriaList`. List of instances of class derived from :class:`~transformers.StoppingCriteria` used to tell if the generation loop should stop. + max_length (:obj:`int`, `optional`, defaults to 20): The maximum length of the sequence to be generated. pad_token_id (:obj:`int`, `optional`): @@ -1191,6 +1200,8 @@ def greedy_search( Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details. return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`): Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. 
+ synced_gpus (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) model_kwargs: Additional model specific keyword arguments will be forwarded to the :obj:`forward` function of the model. If model is an encoder-decoder model the kwargs should include :obj:`encoder_outputs`. @@ -1265,7 +1276,19 @@ def greedy_search( input_ids, max_length ) + this_peer_finished = False # used by synced_gpus only while cur_len < max_length: + + if synced_gpus: + # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. + # The following logic allows an early break if all peers finished generating their sequence + this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) + # send 0.0 if we finished, 1.0 otherwise + dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) + # did all peers finish? the reduced sum will be 0.0 then + if this_peer_finished_flag.item() == 0.0: + break + # prepare model inputs model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) @@ -1276,6 +1299,11 @@ def greedy_search( output_attentions=output_attentions, output_hidden_states=output_hidden_states, ) + + if synced_gpus and this_peer_finished: + cur_len = cur_len + 1 + continue # don't waste resources running the code we don't need + next_token_logits = outputs.logits[:, -1, :] # Store scores, attentions and hidden_states when required @@ -1321,16 +1349,16 @@ def greedy_search( outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder ) - # stop when there is a in each sentence, or if we exceed the maximum length - if unfinished_sequences.max() == 0: - break - - if stopping_criteria(input_ids, scores): - break - # increase cur_len cur_len = cur_len + 1 + # stop when there is a in each sentence, or if we exceed the maximum length + if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores): + if not synced_gpus: + break + else: + this_peer_finished = True + if return_dict_in_generate: if self.config.is_encoder_decoder: return GreedySearchEncoderDecoderOutput( @@ -1365,6 +1393,7 @@ def sample( output_hidden_states: Optional[bool] = None, output_scores: Optional[bool] = None, return_dict_in_generate: Optional[bool] = None, + synced_gpus: Optional[bool] = None, **model_kwargs, ) -> Union[SampleOutput, torch.LongTensor]: r""" @@ -1402,6 +1431,8 @@ def sample( Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details. return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`): Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. + synced_gpus (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) model_kwargs: Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model. If model is an encoder-decoder model the kwargs should include :obj:`encoder_outputs`. @@ -1485,8 +1516,20 @@ def sample( input_ids, max_length ) + this_peer_finished = False # used by synced_gpus only # auto-regressive generation while cur_len < max_length: + + if synced_gpus: + # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. 
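The `synced_gpus` flag threaded through `generate` and each decoding method exists because under ZeRO stage 3 every rank must participate in every `forward` call, so a rank that finishes its own sequence early has to keep looping (while skipping the bookkeeping) until all ranks are done. The helper below is not part of the patch; it is a distilled sketch of the `all_reduce` handshake performed at the top of each while loop, assuming `torch.distributed` has already been initialized:

```python
import torch
import torch.distributed as dist


def all_peers_finished(this_peer_finished: bool, device: torch.device) -> bool:
    """Each rank contributes 1.0 while it still has tokens to generate and 0.0 once it
    is done; after the all_reduce the sum is 0.0 only when every rank has finished,
    which is the only point at which any rank may safely leave its generation loop."""
    flag = torch.tensor(0.0 if this_peer_finished else 1.0, device=device)
    dist.all_reduce(flag, op=dist.ReduceOp.SUM)
    return flag.item() == 0.0
```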
+ # The following logic allows an early break if all peers finished generating their sequence + this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) + # send 0.0 if we finished, 1.0 otherwise + dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) + # did all peers finish? the reduced sum will be 0.0 then + if this_peer_finished_flag.item() == 0.0: + break + # prepare model inputs model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) @@ -1497,6 +1540,11 @@ def sample( output_attentions=output_attentions, output_hidden_states=output_hidden_states, ) + + if synced_gpus and this_peer_finished: + cur_len = cur_len + 1 + continue # don't waste resources running the code we don't need + next_token_logits = outputs.logits[:, -1, :] # pre-process distribution @@ -1533,7 +1581,6 @@ def sample( # add token and increase length by one input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) - cur_len = cur_len + 1 # update sequence length if eos_token_id is not None: @@ -1541,18 +1588,21 @@ def sample( sequence_lengths, unfinished_sequences, cur_len, next_tokens == eos_token_id ) - # stop when there is a in each sentence, or if we exceed the maximum length - if unfinished_sequences.max() == 0: - break - - if stopping_criteria(input_ids, scores): - break - # update model kwargs model_kwargs = self._update_model_kwargs_for_generation( outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder ) + # increase cur_len + cur_len = cur_len + 1 + + # stop when there is a in each sentence, or if we exceed the maximum length + if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores): + if not synced_gpus: + break + else: + this_peer_finished = True + if return_dict_in_generate: if self.config.is_encoder_decoder: return SampleEncoderDecoderOutput( @@ -1587,6 +1637,7 @@ def beam_search( output_hidden_states: Optional[bool] = None, output_scores: Optional[bool] = None, return_dict_in_generate: Optional[bool] = None, + synced_gpus: Optional[bool] = None, **model_kwargs, ) -> Union[BeamSearchOutput, torch.LongTensor]: r""" @@ -1624,6 +1675,8 @@ def beam_search( Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details. return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`): Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. + synced_gpus (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) model_kwargs: Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model. If model is an encoder-decoder model the kwargs should include :obj:`encoder_outputs`. @@ -1726,7 +1779,19 @@ def beam_search( beam_scores[:, 1:] = -1e9 beam_scores = beam_scores.view((batch_size * num_beams,)) + this_peer_finished = False # used by synced_gpus only while cur_len < max_length: + + if synced_gpus: + # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. + # The following logic allows an early break if all peers finished generating their sequence + this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) + # send 0.0 if we finished, 1.0 otherwise + dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) + # did all peers finish? 
the reduced sum will be 0.0 then + if this_peer_finished_flag.item() == 0.0: + break + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) outputs = self( @@ -1735,6 +1800,11 @@ def beam_search( output_attentions=output_attentions, output_hidden_states=output_hidden_states, ) + + if synced_gpus and this_peer_finished: + cur_len = cur_len + 1 + continue # don't waste resources running the code we don't need + next_token_logits = outputs.logits[:, -1, :] # hack: adjust tokens for Marian. For Marian we have to make sure that the `pad_token_id` @@ -1792,19 +1862,20 @@ def beam_search( input_ids = torch.cat([input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1) - cur_len = cur_len + 1 - model_kwargs = self._update_model_kwargs_for_generation( outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder ) if model_kwargs["past"] is not None: model_kwargs["past"] = self._reorder_cache(model_kwargs["past"], beam_idx) - if beam_scorer.is_done: - break + # increase cur_len + cur_len = cur_len + 1 - if stopping_criteria(input_ids, scores): - break + if beam_scorer.is_done or stopping_criteria(input_ids, scores): + if not synced_gpus: + break + else: + this_peer_finished = True sequence_outputs = beam_scorer.finalize( input_ids, beam_scores, next_tokens, next_indices, pad_token_id=pad_token_id, eos_token_id=eos_token_id @@ -1849,6 +1920,7 @@ def beam_sample( output_hidden_states: Optional[bool] = None, output_scores: Optional[bool] = None, return_dict_in_generate: Optional[bool] = None, + synced_gpus: Optional[bool] = None, **model_kwargs, ) -> Union[BeamSampleOutput, torch.LongTensor]: r""" @@ -1890,6 +1962,8 @@ def beam_sample( Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details. return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`): Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. + synced_gpus (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) model_kwargs: Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model. If model is an encoder-decoder model the kwargs should include :obj:`encoder_outputs`. @@ -1993,7 +2067,19 @@ def beam_sample( beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) beam_scores = beam_scores.view((batch_size * num_beams,)) + this_peer_finished = False # used by synced_gpus only while cur_len < max_length: + + if synced_gpus: + # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. + # The following logic allows an early break if all peers finished generating their sequence + this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) + # send 0.0 if we finished, 1.0 otherwise + dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) + # did all peers finish? the reduced sum will be 0.0 then + if this_peer_finished_flag.item() == 0.0: + break + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) outputs = self( @@ -2002,6 +2088,11 @@ def beam_sample( output_attentions=output_attentions, output_hidden_states=output_hidden_states, ) + + if synced_gpus and this_peer_finished: + cur_len = cur_len + 1 + continue # don't waste resources running the code we don't need + next_token_logits = outputs.logits[:, -1, :] # hack: adjust tokens for Marian. 
For Marian we have to make sure that the `pad_token_id` @@ -2063,7 +2154,6 @@ def beam_sample( beam_idx = beam_outputs["next_beam_indices"] input_ids = torch.cat([input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1) - cur_len = cur_len + 1 model_kwargs = self._update_model_kwargs_for_generation( outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder @@ -2071,11 +2161,14 @@ def beam_sample( if model_kwargs["past"] is not None: model_kwargs["past"] = self._reorder_cache(model_kwargs["past"], beam_idx) - if beam_scorer.is_done: - break + # increase cur_len + cur_len = cur_len + 1 - if stopping_criteria(input_ids, scores): - break + if beam_scorer.is_done or stopping_criteria(input_ids, scores): + if not synced_gpus: + break + else: + this_peer_finished = True sequence_outputs = beam_scorer.finalize( input_ids, beam_scores, next_tokens, next_indices, pad_token_id=pad_token_id, eos_token_id=eos_token_id @@ -2119,6 +2212,7 @@ def group_beam_search( output_hidden_states: Optional[bool] = None, output_scores: Optional[bool] = None, return_dict_in_generate: Optional[bool] = None, + synced_gpus: Optional[bool] = None, **model_kwargs, ): r""" @@ -2156,6 +2250,9 @@ def group_beam_search( Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details. return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`): Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. + synced_gpus (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + model_kwargs: Additional model specific kwargs that will be forwarded to the :obj:`forward` function of the model. If model is an encoder-decoder model the kwargs should include :obj:`encoder_outputs`. @@ -2266,7 +2363,19 @@ def group_beam_search( beam_scores[:, ::num_sub_beams] = 0 beam_scores = beam_scores.view((batch_size * num_beams,)) + this_peer_finished = False # used by synced_gpus only while cur_len < max_length: + + if synced_gpus: + # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. + # The following logic allows an early break if all peers finished generating their sequence + this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) + # send 0.0 if we finished, 1.0 otherwise + dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) + # did all peers finish? 
the reduced sum will be 0.0 then + if this_peer_finished_flag.item() == 0.0: + break + # predicted tokens in cur_len step current_tokens = torch.zeros(batch_size * num_beams, dtype=input_ids.dtype, device=device) @@ -2282,6 +2391,10 @@ def group_beam_search( output_hidden_states=output_hidden_states, ) + if synced_gpus and this_peer_finished: + cur_len = cur_len + 1 + continue # don't waste resources running the code we don't need + for beam_group_idx in range(num_beam_groups): group_start_idx = beam_group_idx * num_sub_beams group_end_idx = min(group_start_idx + num_sub_beams, num_beams) @@ -2372,19 +2485,22 @@ def group_beam_search( else (outputs.hidden_states,) ) + input_ids = torch.cat([input_ids, current_tokens.unsqueeze(-1)], dim=-1) + model_kwargs = self._update_model_kwargs_for_generation( outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder ) if model_kwargs["past"] is not None: model_kwargs["past"] = self._reorder_cache(model_kwargs["past"], reordering_indices) - input_ids = torch.cat([input_ids, current_tokens.unsqueeze(-1)], dim=-1) + # increase cur_len cur_len = cur_len + 1 - if beam_scorer.is_done: - break - if stopping_criteria(input_ids, scores): - break + if beam_scorer.is_done or stopping_criteria(input_ids, scores): + if not synced_gpus: + break + else: + this_peer_finished = True sequence_outputs = beam_scorer.finalize( input_ids, beam_scores, next_tokens, next_indices, pad_token_id=pad_token_id, eos_token_id=eos_token_id diff --git a/src/transformers/integrations.py b/src/transformers/integrations.py index 57336f8fe71e1f..7e4ab0f5c7a100 100644 --- a/src/transformers/integrations.py +++ b/src/transformers/integrations.py @@ -19,12 +19,13 @@ import json import numbers import os +import sys import tempfile from copy import deepcopy from pathlib import Path +from .dependency_versions_check import dep_version_check from .utils import logging -from .utils.versions import require_version logger = logging.get_logger(__name__) @@ -54,7 +55,7 @@ def is_wandb_available(): # any value of WANDB_DISABLED disables wandb if os.getenv("WANDB_DISABLED", "").upper() in ENV_VARS_TRUE_VALUES: - logger.warn( + logger.warning( "Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the " "--report_to flag to control the integrations used for logging result (for instance --report_to none)." ) @@ -268,7 +269,77 @@ def rewrite_logs(d): return new_d -def init_deepspeed(trainer, num_training_steps, resume_from_checkpoint=None): +_is_deepspeed_zero3_enabled = None + + +def is_deepspeed_zero3_enabled(): + """ + This function answers to the question of whether DeepSpeed is going to be used and run using ZeRO Stage 3. + + It includes an auto-discovery method, see comments in the code for details. + + Returns: ``True`` if either it was explicitly enabled via ``deepspeed_zero3_enable(True)`` or the auto-detector was + able to derive that the ``Trainer`` will be running via DeepSpeed ZeRO stage 3. + """ + global _is_deepspeed_zero3_enabled + if _is_deepspeed_zero3_enabled is None: + _is_deepspeed_zero3_enabled = False + # Try to auto-discover if we are about to use DeepSpeed with ZeRO3 enabled. This will only + # work for scripts using cli to pass --deepspeed ds_config.json. 
If cmd args aren't used, + # then to get the model efficiently loaded across multiple-gpus one has to explicitly call + # deepspeed_zero3_enable(True) **before** instantiating a model object + if "--deepspeed" in sys.argv: + idx = sys.argv.index("--deepspeed") + ds_config = sys.argv[idx + 1] + if not os.path.exists(ds_config): + raise ValueError("--deepspeed requires a valid path to a config file") + config = deepspeed_parse_config(ds_config) + if ( + "zero_optimization" in config + and "stage" in config["zero_optimization"] + and config["zero_optimization"]["stage"] == 3 + ): + _is_deepspeed_zero3_enabled = True + + return _is_deepspeed_zero3_enabled + + +def deepspeed_zero3_enable(enable=True): + """ + ``is_deepspeed_zero3_enabled()`` tries to derive automatically if DeepSpeed ZeRO 3 is going to be used by looking + at ``sys.argv`` which may or may not contain information about where to find the DeepSpeed config if any. + + This function allows for explicit enabling/disabling of this global flag. + + Args: + enable: if set to ``True`` will make ``is_deepspeed_zero3_enabled()`` return ``True`` + """ + global _is_deepspeed_zero3_enabled + _is_deepspeed_zero3_enabled = enable + + +def deepspeed_parse_config(ds_config): + """ + If ``ds_config`` isn't already a dict, read it from the config file. + + If it's already a dict, return a copy of it, so that we can freely modify it. + """ + dep_version_check("deepspeed") + + if isinstance(ds_config, dict): + # Don't modify user's data should they want to reuse it (e.g. in tests), because once we + # modified it, it will not be accepted here again, since some config params must not be set by users + config = deepcopy(ds_config) + elif isinstance(ds_config, str): + with io.open(ds_config, "r", encoding="utf-8") as f: + config = json.load(f) + else: + raise ValueError("expecting either a path to a config file or a pre-populated dict") + + return config + + +def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None): """ Init DeepSpeed, after updating the DeepSpeed configuration with any relevant Trainer's args. @@ -284,21 +355,10 @@ def init_deepspeed(trainer, num_training_steps, resume_from_checkpoint=None): """ import deepspeed - require_version("deepspeed>0.3.12") - args = trainer.args - ds_config_file = args.deepspeed model = trainer.model - if isinstance(args.deepspeed, dict): - # Don't modify user's data should they want to reuse it (e.g.
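Because the auto-detection above only inspects `sys.argv` for `--deepspeed`, a script that wires up its DeepSpeed config programmatically has to flip the flag itself before the model is instantiated. A hedged usage sketch (the checkpoint name is just an example; launching under the DeepSpeed runner and supplying the ZeRO-3 config are assumed to happen elsewhere):

```python
from transformers import AutoModelForSeq2SeqLM
from transformers.integrations import deepspeed_zero3_enable, is_deepspeed_zero3_enabled

# Explicit opt-in: there is no --deepspeed argument on the command line, so the
# sys.argv auto-discovery cannot see the stage 3 config and the flag is set by hand.
deepspeed_zero3_enable(True)
assert is_deepspeed_zero3_enabled()

# With the flag set, from_pretrained() wraps model construction in deepspeed.zero.Init()
# and loads the state dict through GatheredParameters (see the modeling_utils hunks below).
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
```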
in tests), because once we - # modified it, it will not be accepted here again, since some config params must be not set by users - config = deepcopy(args.deepspeed) - elif isinstance(args.deepspeed, str): - with io.open(ds_config_file, "r", encoding="utf-8") as f: - config = json.load(f) - else: - raise ValueError("expecting either a path to a config file or a pre-populated dict") + config = deepspeed_parse_config(args.deepspeed) # The following code translates relevant trainer's cl args into the DS config @@ -324,9 +384,7 @@ def init_deepspeed(trainer, num_training_steps, resume_from_checkpoint=None): config["gradient_accumulation_steps"] = args.gradient_accumulation_steps if "gradient_clipping" in config: - logger.info( - f"Keeping the `gradient_clipping` config from {ds_config_file} intact, ignoring any gradient clipping-specific cl args" - ) + logger.info("Keeping the `gradient_clipping` config intact, ignoring any gradient clipping-specific cl args") else: # override only if the ds config doesn't already have this section config["gradient_clipping"] = args.max_grad_norm @@ -336,6 +394,7 @@ def init_deepspeed(trainer, num_training_steps, resume_from_checkpoint=None): # 2. HF scheduler + HF optimizer: Yes # 3. DS scheduler + HF optimizer: Yes # 4. HF scheduler + DS optimizer: No + # # Unless Offload is enabled in which case it's: # 1. DS scheduler + DS optimizer: Yes # 2. HF scheduler + HF optimizer: No @@ -344,7 +403,7 @@ def init_deepspeed(trainer, num_training_steps, resume_from_checkpoint=None): optimizer = None if "optimizer" in config: - logger.info(f"Updating the `scheduler` config from {ds_config_file} with other command line arguments") + logger.info("Updating the `scheduler` config with other command line arguments") # to avoid inconsistent values of lr and warm up steps the command line args override config params = dict( @@ -384,7 +443,7 @@ def init_deepspeed(trainer, num_training_steps, resume_from_checkpoint=None): # WarmupDecayLR| linear | get_linear_schedule_with_warmup | lr_scheduler = None if "scheduler" in config: - logger.info(f"Updating the `scheduler` config from {ds_config_file} with other command line arguments") + logger.info("Updating the `scheduler` config with other command line arguments") # the user won't easily know the correct num_training_steps should they use WarmupDecayLR, # so let's set it to the correct value if config["scheduler"]["type"] == "WarmupDecayLR": @@ -417,9 +476,7 @@ def init_deepspeed(trainer, num_training_steps, resume_from_checkpoint=None): # - `amp`: which delegates amp work to apex (which needs to be available), but it cannot be used with any ZeRO features, so probably best to be avoided. 
if trainer.fp16_backend == "apex": if "amp" in config: - logger.info( - f"Keeping the `amp` config from {ds_config_file} intact, ignoring any amp-specific cl args" - ) + logger.info("Keeping the `amp` config intact, ignoring any amp-specific cl args") else: config["amp"] = { "enabled": True, @@ -427,19 +484,33 @@ def init_deepspeed(trainer, num_training_steps, resume_from_checkpoint=None): } elif trainer.fp16_backend == "amp": if "fp16" in config: - logger.info( - f"Keeping the `fp16` config from {ds_config_file} intact, ignoring any fp16-specific cl args" - ) + logger.info("Keeping the `fp16` config intact, ignoring any fp16-specific cl args") else: config["fp16"] = { "enabled": True, } + # zero + if "zero_optimization" in config: + zero = config["zero_optimization"] + + # now we know for sure if zero3 is enabled + deepspeed_zero3_enable(zero.get("stage") == 3) + + # automatically assign the optimal config values based on model config + hidden_size = model.config.hidden_size + if zero.get("reduce_bucket_size") == 0: + zero["reduce_bucket_size"] = hidden_size * hidden_size + if zero.get("stage3_prefetch_bucket_size") == 0: + zero["stage3_prefetch_bucket_size"] = 0.9 * hidden_size * hidden_size + if zero.get("stage3_param_persistence_threshold") == 0: + zero["stage3_param_persistence_threshold"] = 10 * hidden_size + # keep for quick debug: # from pprint import pprint; pprint(config) - # init that takes part of the config via `args`, and the bulk of it via `config_params` model_parameters = filter(lambda p: p.requires_grad, model.parameters()) + model, optimizer, _, lr_scheduler = deepspeed.initialize( model=model, model_parameters=model_parameters, @@ -448,14 +519,26 @@ def init_deepspeed(trainer, num_training_steps, resume_from_checkpoint=None): lr_scheduler=lr_scheduler, ) - if resume_from_checkpoint is not None: # and os.path.isdir(resume_from_checkpoint): - logger.info(f"Attempting to resume from {resume_from_checkpoint}") - # this magically updates self.optimizer and self.lr_scheduler - load_path, _ = model.load_checkpoint( - resume_from_checkpoint, load_optimizer_states=True, load_lr_scheduler_states=True - ) - if load_path is None: - raise ValueError(f"[deepspeed] failed to resume from checkpoint {resume_from_checkpoint}") + if resume_from_checkpoint is not None: + + # it's possible that the user is trying to resume from model_path, which doesn't necessarily + # contain a deepspeed checkpoint. e.g. examples just check if the dir exists and assume it's + # a resume from a checkpoint and not just a local pretrained weight. 
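The `zero_optimization` block above fills in three bucket/threshold values from the model's `hidden_size` whenever the user leaves them at `0` in the DeepSpeed config. A worked example of the arithmetic for a hypothetical `hidden_size` of 1024:

```python
# Illustration only: the heuristics applied when the corresponding ds_config values are 0.
hidden_size = 1024  # hypothetical model.config.hidden_size

reduce_bucket_size = hidden_size * hidden_size                   # 1_048_576
stage3_prefetch_bucket_size = 0.9 * hidden_size * hidden_size    # 943_718.4
stage3_param_persistence_threshold = 10 * hidden_size            # 10_240
```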
So we check here if the + # path contains what looks like a deepspeed checkpoint + import glob + + deepspeed_checkpoint_dirs = sorted(glob.glob(f"{resume_from_checkpoint}/global_step*")) + + if len(deepspeed_checkpoint_dirs) > 0: + logger.info(f"Attempting to resume from {resume_from_checkpoint}") + # this magically updates self.optimizer and self.lr_scheduler + load_path, _ = model.load_checkpoint( + resume_from_checkpoint, load_optimizer_states=True, load_lr_scheduler_states=True + ) + if load_path is None: + raise ValueError(f"[deepspeed] failed to resume from checkpoint {resume_from_checkpoint}") + else: + logger.info(f"{resume_from_checkpoint} doesn't have deepspeed checkpoints, doing nothing") return model, optimizer, lr_scheduler @@ -521,9 +604,11 @@ def on_train_begin(self, args, state, control, **kwargs): self.tb_writer.add_hparams(args.to_sanitized_dict(), metric_dict={}) def on_log(self, args, state, control, logs=None, **kwargs): - if state.is_world_process_zero: - if self.tb_writer is None: - self._init_summary_writer(args) + if not state.is_world_process_zero: + return + + if self.tb_writer is None: + self._init_summary_writer(args) if self.tb_writer is not None: logs = rewrite_logs(logs) diff --git a/src/transformers/modeling_flax_utils.py b/src/transformers/modeling_flax_utils.py index c425f1a0006284..b9464ad3e5847f 100644 --- a/src/transformers/modeling_flax_utils.py +++ b/src/transformers/modeling_flax_utils.py @@ -387,6 +387,7 @@ def save_pretrained(self, save_directory: Union[str, os.PathLike]): # get abs dir save_directory = os.path.abspath(save_directory) # save config as well + self.config.architectures = [self.__class__.__name__[4:]] self.config.save_pretrained(save_directory) # save model diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index 36e2b403b48738..002a7667f20487 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -290,7 +290,7 @@ def booleans_processing(config, **kwargs): or kwargs["output_hidden_states"] is not None or ("use_cache" in kwargs and kwargs["use_cache"] is not None) ): - tf_logger.warn( + tf_logger.warning( "The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model." "They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`)." ) @@ -299,7 +299,9 @@ def booleans_processing(config, **kwargs): final_booleans["output_hidden_states"] = config.output_hidden_states if kwargs["return_dict"] is not None: - tf_logger.warn("The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.") + tf_logger.warning( + "The parameter `return_dict` cannot be set in graph mode and will always be set to `True`." + ) final_booleans["return_dict"] = True if "use_cache" in kwargs: @@ -398,7 +400,7 @@ def input_processing(func, config, input_ids, **kwargs): if isinstance(v, allowed_types) or v is None: output[k] = v elif k not in parameter_names and "args" not in parameter_names: - logger.warn( + logger.warning( f"The parameter {k} does not belongs to the parameter list {parameter_names} and will be ignored." 
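The resume logic above no longer assumes that any existing directory is a DeepSpeed checkpoint; `load_checkpoint` is only attempted when the directory contains the engine's `global_step*` sub-directories. A small stand-alone sketch of that guard (the function name is mine, not the patch's):

```python
import glob
import os


def looks_like_deepspeed_checkpoint(path: str) -> bool:
    """True only if the directory holds DeepSpeed engine state (global_step* sub-dirs),
    as opposed to a plain directory of pretrained weights."""
    return len(glob.glob(os.path.join(path, "global_step*"))) > 0
```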
) continue @@ -1035,6 +1037,7 @@ def save_pretrained(self, save_directory, saved_model=False, version=1): logger.info(f"Saved model created in {saved_model_dir}") # Save configuration file + self.config.architectures = [self.__class__.__name__[2:]] self.config.save_pretrained(save_directory) # If we save using the predefined names, we can load using `from_pretrained` diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index fdc2ea1dc7b32c..b34b2d4f071644 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -41,6 +41,7 @@ replace_return_docstrings, ) from .generation_utils import GenerationMixin +from .integrations import is_deepspeed_zero3_enabled from .utils import logging @@ -660,7 +661,14 @@ def _get_resized_embeddings( if new_num_tokens is None: return old_embeddings - old_num_tokens, old_embedding_dim = old_embeddings.weight.size() + if is_deepspeed_zero3_enabled(): + import deepspeed + + with deepspeed.zero.GatheredParameters(old_embeddings.weight, modifier_rank=None): + old_num_tokens, old_embedding_dim = old_embeddings.weight.size() + else: + old_num_tokens, old_embedding_dim = old_embeddings.weight.size() + if old_num_tokens == new_num_tokens: return old_embeddings @@ -677,8 +685,17 @@ def _get_resized_embeddings( self._init_weights(new_embeddings) # Copy token embeddings from the previous weights - num_tokens_to_copy = min(old_num_tokens, new_num_tokens) - new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :] + + # numbers of tokens to copy + n = min(old_num_tokens, new_num_tokens) + if is_deepspeed_zero3_enabled(): + import deepspeed + + with deepspeed.zero.GatheredParameters(old_embeddings.weight, modifier_rank=0): + if torch.distributed.get_rank() == 0: + new_embeddings.weight.data[:n, :] = old_embeddings.weight.data[:n, :] + else: + new_embeddings.weight.data[:n, :] = old_embeddings.weight.data[:n, :] return new_embeddings @@ -1056,7 +1073,16 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P config.name_or_path = pretrained_model_name_or_path # Instantiate model. - model = cls(config, *model_args, **model_kwargs) + + if is_deepspeed_zero3_enabled(): + import deepspeed + + logger.info("Detected DeepSpeed ZeRO-3: activating zero.init() for this model") + # this immediately partitions the model to avoid the overhead in time and memory copying it on CPU or each GPU first + with deepspeed.zero.Init(): + model = cls(config, *model_args, **model_kwargs) + else: + model = cls(config, *model_args, **model_kwargs) if state_dict is None and not from_tf: try: @@ -1114,15 +1140,19 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P # so we need to apply the function recursively. 
def load(module: nn.Module, prefix=""): local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) - module._load_from_state_dict( - state_dict, - prefix, - local_metadata, - True, - missing_keys, - unexpected_keys, - error_msgs, - ) + args = (state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs) + if is_deepspeed_zero3_enabled(): + import deepspeed + + # because zero3 puts placeholders in model params, this context + # manager gathers (unpartitions) the params of the current layer, then loads from + # the state dict and then re-partitions them again + with deepspeed.zero.GatheredParameters(list(module.parameters(recurse=False)), modifier_rank=0): + if torch.distributed.get_rank() == 0: + module._load_from_state_dict(*args) + else: + module._load_from_state_dict(*args) + for name, child in module._modules.items(): if child is not None: load(child, prefix + name + ".") diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index efc6aedef39105..0092c46a976768 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -30,6 +30,7 @@ blenderbot_small, camembert, convbert, + cpm, ctrl, deberta, dialogpt, @@ -50,6 +51,7 @@ m2m_100, marian, mbart, + megatron_bert, mmbt, mobilebert, mpnet, diff --git a/src/transformers/models/albert/tokenization_albert.py b/src/transformers/models/albert/tokenization_albert.py index a271f860644320..92c06bbcde6314 100644 --- a/src/transformers/models/albert/tokenization_albert.py +++ b/src/transformers/models/albert/tokenization_albert.py @@ -267,12 +267,9 @@ def get_special_tokens_mask( """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) if token_ids_1 is not None: return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] diff --git a/src/transformers/models/albert/tokenization_albert_fast.py b/src/transformers/models/albert/tokenization_albert_fast.py index 1d6e82b12d9bb9..cb817ddcc01fdb 100644 --- a/src/transformers/models/albert/tokenization_albert_fast.py +++ b/src/transformers/models/albert/tokenization_albert_fast.py @@ -184,37 +184,6 @@ def build_inputs_with_special_tokens( return cls + token_ids_0 + sep return cls + token_ids_0 + sep + token_ids_1 + sep - def get_special_tokens_mask( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False - ) -> List[int]: - """ - Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` method. - - Args: - token_ids_0 (:obj:`List[int]`): - List of ids. - token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): - Set to True if the token list is already formatted with special tokens for the model - - Returns: - :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. 
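Both modeling_utils hunks follow the same ZeRO-3 discipline: under stage 3 a module's parameters are placeholders, so any code that reads or writes real weight tensors must gather them first, and writes are performed on rank 0 only. A generic sketch of the pattern, mirrored from the embedding-resize hunk (assumes DeepSpeed is installed and distributed training is initialized):

```python
import deepspeed
import torch


def copy_embedding_rows(old: torch.nn.Embedding, new: torch.nn.Embedding, n: int) -> None:
    # Gather the partitioned weight so its full data is visible inside the context;
    # modifier_rank=0 tells DeepSpeed that only rank 0 will modify tensors in here.
    with deepspeed.zero.GatheredParameters(old.weight, modifier_rank=0):
        if torch.distributed.get_rank() == 0:
            new.weight.data[:n, :] = old.weight.data[:n, :]
```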
- """ - - if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) - - if token_ids_1 is not None: - return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1] - def create_token_type_ids_from_sequences( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: diff --git a/src/transformers/models/auto/__init__.py b/src/transformers/models/auto/__init__.py index 8bf312231a75b4..4abf6da50d8c79 100644 --- a/src/transformers/models/auto/__init__.py +++ b/src/transformers/models/auto/__init__.py @@ -22,7 +22,9 @@ _import_structure = { + "auto_factory": ["get_values"], "configuration_auto": ["ALL_PRETRAINED_CONFIG_ARCHIVE_MAP", "CONFIG_MAPPING", "MODEL_NAMES_MAPPING", "AutoConfig"], + "feature_extraction_auto": ["FEATURE_EXTRACTOR_MAPPING", "AutoFeatureExtractor"], "tokenization_auto": ["TOKENIZER_MAPPING", "AutoTokenizer"], } @@ -103,7 +105,9 @@ if TYPE_CHECKING: + from .auto_factory import get_values from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, CONFIG_MAPPING, MODEL_NAMES_MAPPING, AutoConfig + from .feature_extraction_auto import FEATURE_EXTRACTOR_MAPPING, AutoFeatureExtractor from .tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer if is_torch_available(): diff --git a/src/transformers/models/auto/auto_factory.py b/src/transformers/models/auto/auto_factory.py index 1c96f13199e82f..4ec9b6c31c16b1 100644 --- a/src/transformers/models/auto/auto_factory.py +++ b/src/transformers/models/auto/auto_factory.py @@ -328,6 +328,26 @@ """ +def _get_model_class(config, model_mapping): + supported_models = model_mapping[type(config)] + if not isinstance(supported_models, (list, tuple)): + return supported_models + + name_to_model = {model.__name__: model for model in supported_models} + architectures = getattr(config, "architectures", []) + for arch in architectures: + if arch in name_to_model: + return name_to_model[arch] + elif f"TF{arch}" in name_to_model: + return name_to_model[f"TF{arch}"] + elif f"Flax{arch}" in name_to_model: + return name_to_model[f"Flax{arch}"] + + # If not architecture is set in the config or match the supported models, the first element of the tuple is the + # defaults. + return supported_models[0] + + class _BaseAutoModelClass: # Base class for auto models. _model_mapping = None @@ -341,7 +361,8 @@ def __init__(self): def from_config(cls, config, **kwargs): if type(config) in cls._model_mapping.keys(): - return cls._model_mapping[type(config)](config, **kwargs) + model_class = _get_model_class(config, cls._model_mapping) + return model_class(config, **kwargs) raise ValueError( f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n" f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}." 
@@ -356,9 +377,8 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): ) if type(config) in cls._model_mapping.keys(): - return cls._model_mapping[type(config)].from_pretrained( - pretrained_model_name_or_path, *model_args, config=config, **kwargs - ) + model_class = _get_model_class(config, cls._model_mapping) + return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) raise ValueError( f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n" f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}." @@ -418,3 +438,14 @@ def auto_class_factory(name, model_mapping, checkpoint_for_example="bert-base-ca from_pretrained = replace_list_option_in_docstrings(model_mapping)(from_pretrained) new_class.from_pretrained = classmethod(from_pretrained) return new_class + + +def get_values(model_mapping): + result = [] + for model in model_mapping.values(): + if isinstance(model, (list, tuple)): + result += list(model) + else: + result.append(model) + + return result diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index b6bf0ad2239538..aa095c4e6a7849 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -50,6 +50,7 @@ from ..m2m_100.configuration_m2m_100 import M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP, M2M100Config from ..marian.configuration_marian import MarianConfig from ..mbart.configuration_mbart import MBART_PRETRAINED_CONFIG_ARCHIVE_MAP, MBartConfig +from ..megatron_bert.configuration_megatron_bert import MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, MegatronBertConfig from ..mobilebert.configuration_mobilebert import MobileBertConfig from ..mpnet.configuration_mpnet import MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP, MPNetConfig from ..mt5.configuration_mt5 import MT5Config @@ -85,6 +86,7 @@ # Add archive maps here GPT_NEO_PRETRAINED_CONFIG_ARCHIVE_MAP, BIG_BIRD_PRETRAINED_CONFIG_ARCHIVE_MAP, + MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, VIT_PRETRAINED_CONFIG_ARCHIVE_MAP, WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -155,6 +157,7 @@ ("pegasus", PegasusConfig), ("marian", MarianConfig), ("mbart", MBartConfig), + ("megatron_bert", MegatronBertConfig), ("mpnet", MPNetConfig), ("bart", BartConfig), ("blenderbot", BlenderbotConfig), @@ -211,6 +214,7 @@ ("blenderbot", "Blenderbot"), ("marian", "Marian"), ("mbart", "mBART"), + ("megatron_bert", "MegatronBert"), ("bart", "BART"), ("reformer", "Reformer"), ("longformer", "Longformer"), @@ -243,29 +247,38 @@ ) +def _get_class_name(model_class): + if isinstance(model_class, (list, tuple)): + return " or ".join([f":class:`~transformers.{c.__name__}`" for c in model_class]) + return f":class:`~transformers.{model_class.__name__}`" + + def _list_model_options(indent, config_to_class=None, use_model_types=True): if config_to_class is None and not use_model_types: raise ValueError("Using `use_model_types=False` requires a `config_to_class` dictionary.") if use_model_types: if config_to_class is None: - model_type_to_name = {model_type: config.__name__ for model_type, config in CONFIG_MAPPING.items()} + model_type_to_name = { + model_type: f":class:`~transformers.{config.__name__}`" + for model_type, config in CONFIG_MAPPING.items() + } else: model_type_to_name = { - model_type: config_to_class[config].__name__ + model_type: 
_get_class_name(config_to_class[config]) for model_type, config in CONFIG_MAPPING.items() if config in config_to_class } lines = [ - f"{indent}- **{model_type}** -- :class:`~transformers.{model_type_to_name[model_type]}` ({MODEL_NAMES_MAPPING[model_type]} model)" + f"{indent}- **{model_type}** -- {model_type_to_name[model_type]} ({MODEL_NAMES_MAPPING[model_type]} model)" for model_type in sorted(model_type_to_name.keys()) ] else: - config_to_name = {config.__name__: clas.__name__ for config, clas in config_to_class.items()} + config_to_name = {config.__name__: _get_class_name(clas) for config, clas in config_to_class.items()} config_to_model_name = { config.__name__: MODEL_NAMES_MAPPING[model_type] for model_type, config in CONFIG_MAPPING.items() } lines = [ - f"{indent}- :class:`~transformers.{config_name}` configuration class: :class:`~transformers.{config_to_name[config_name]}` ({config_to_model_name[config_name]} model)" + f"{indent}- :class:`~transformers.{config_name}` configuration class: {config_to_name[config_name]} ({config_to_model_name[config_name]} model)" for config_name in sorted(config_to_name.keys()) ] return "\n".join(lines) diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py new file mode 100644 index 00000000000000..097a336c96dba6 --- /dev/null +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -0,0 +1,150 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" AutoFeatureExtractor class. """ + +from collections import OrderedDict + +from ...feature_extraction_utils import FeatureExtractionMixin +from ...file_utils import is_speech_available, is_vision_available +from ..wav2vec2.feature_extraction_wav2vec2 import Wav2Vec2FeatureExtractor +from .configuration_auto import replace_list_option_in_docstrings + + +if is_speech_available(): + from ..speech_to_text.feature_extraction_speech_to_text import Speech2TextFeatureExtractor +else: + Speech2TextFeatureExtractor = None + +if is_vision_available(): + from ..vit.feature_extraction_vit import ViTFeatureExtractor +else: + ViTFeatureExtractor = None + + +# Build the list of all feature extractors +FEATURE_EXTRACTOR_MAPPING = OrderedDict( + [ + ("s2t", Speech2TextFeatureExtractor), + ("vit", ViTFeatureExtractor), + ("wav2vec2", Wav2Vec2FeatureExtractor), + ] +) + + +def feature_extractor_class_from_name(class_name: str): + for c in FEATURE_EXTRACTOR_MAPPING.values(): + if c is not None and c.__name__ == class_name: + return c + + +class AutoFeatureExtractor: + r""" + This is a generic feature extractor class that will be instantiated as one of the feature extractor classes of the + library when created with the :meth:`AutoFeatureExtractor.from_pretrained` class method. + + This class cannot be instantiated directly using ``__init__()`` (throws an error). 
+ """ + + def __init__(self): + raise EnvironmentError( + "AutoFeatureExtractor is designed to be instantiated " + "using the `AutoFeatureExtractor.from_pretrained(pretrained_model_name_or_path)` method." + ) + + @classmethod + @replace_list_option_in_docstrings(FEATURE_EXTRACTOR_MAPPING) + def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): + r""" + Instantiate one of the feature extractor classes of the library from a pretrained model vocabulary. + + The tokenizer class to instantiate is selected based on the :obj:`model_type` property of the config object + (either passed as an argument or loaded from :obj:`pretrained_model_name_or_path` if possible), or when it's + missing, by falling back to using pattern matching on :obj:`pretrained_model_name_or_path`: + + List options + + Params: + pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`): + This can be either: + + - a string, the `model id` of a pretrained feature_extractor hosted inside a model repo on + huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or + namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``. + - a path to a `directory` containing a feature extractor file saved using the + :func:`~transformers.feature_extraction_utils.FeatureExtractionMixin.save_pretrained` method, e.g., + ``./my_model_directory/``. + - a path or url to a saved feature extractor JSON `file`, e.g., + ``./my_model_directory/feature_extraction_config.json``. + cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`): + Path to a directory in which a downloaded pretrained model feature extractor should be cached if the + standard cache should not be used. + force_download (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to force to (re-)download the feature extractor files and override the cached versions + if they exist. + resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to delete incompletely received file. Attempts to resume the download if such a file + exists. + proxies (:obj:`Dict[str, str]`, `optional`): + A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. + use_auth_token (:obj:`str` or `bool`, `optional`): + The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token + generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`). + revision(:obj:`str`, `optional`, defaults to :obj:`"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any + identifier allowed by git. + return_unused_kwargs (:obj:`bool`, `optional`, defaults to :obj:`False`): + If :obj:`False`, then this function returns just the final feature extractor object. If :obj:`True`, + then this functions returns a :obj:`Tuple(feature_extractor, unused_kwargs)` where `unused_kwargs` is a + dictionary consisting of the key/value pairs whose keys are not feature extractor attributes: i.e., the + part of ``kwargs`` which has not been used to update ``feature_extractor`` and is otherwise ignored. + kwargs (:obj:`Dict[str, Any]`, `optional`): + The values in kwargs of any keys which are feature extractor attributes will be used to override the + loaded values. 
Behavior concerning key/value pairs whose keys are *not* feature extractor attributes is + controlled by the ``return_unused_kwargs`` keyword parameter. + + .. note:: + + Passing :obj:`use_auth_token=True` is required when you want to use a private model. + + Examples:: + + >>> from transformers import AutoFeatureExtractor + + >>> # Download vocabulary from huggingface.co and cache. + >>> feature_extractor = AutoFeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h') + + >>> # If vocabulary files are in a directory (e.g. feature extractor was saved using `save_pretrained('./test/saved_model/')`) + >>> feature_extractor = AutoFeatureExtractor.from_pretrained('./test/saved_model/') + + """ + kwargs["_from_auto"] = True + config_dict, _ = FeatureExtractionMixin.get_feature_extractor_dict(pretrained_model_name_or_path, **kwargs) + + if "feature_extractor_type" in config_dict: + feature_extractor_class = feature_extractor_class_from_name(config_dict["feature_extractor_type"]) + return feature_extractor_class.from_dict(config_dict, **kwargs) + else: + # Fallback: use pattern matching on the string. + for pattern, feature_extractor_class in FEATURE_EXTRACTOR_MAPPING.items(): + if pattern in str(pretrained_model_name_or_path): + return feature_extractor_class.from_dict(config_dict, **kwargs) + + raise ValueError( + f"Unrecognized model in {pretrained_model_name_or_path}. Should have a `feature_extractor_type` key in " + "its feature_extraction_config.json, or contain one of the following strings " + f"in its name: {', '.join(FEATURE_EXTRACTOR_MAPPING.keys())}" + ) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index ccebed05280a54..cf01739296992e 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -124,6 +124,7 @@ ) from ..fsmt.modeling_fsmt import FSMTForConditionalGeneration, FSMTModel from ..funnel.modeling_funnel import ( + FunnelBaseModel, FunnelForMaskedLM, FunnelForMultipleChoice, FunnelForPreTraining, @@ -174,6 +175,17 @@ MBartForSequenceClassification, MBartModel, ) +from ..megatron_bert.modeling_megatron_bert import ( + MegatronBertForCausalLM, + MegatronBertForMaskedLM, + MegatronBertForMultipleChoice, + MegatronBertForNextSentencePrediction, + MegatronBertForPreTraining, + MegatronBertForQuestionAnswering, + MegatronBertForSequenceClassification, + MegatronBertForTokenClassification, + MegatronBertModel, +) from ..mobilebert.modeling_mobilebert import ( MobileBertForMaskedLM, MobileBertForMultipleChoice, @@ -298,6 +310,7 @@ M2M100Config, MarianConfig, MBartConfig, + MegatronBertConfig, MobileBertConfig, MPNetConfig, MT5Config, @@ -355,6 +368,7 @@ (BertConfig, BertModel), (OpenAIGPTConfig, OpenAIGPTModel), (GPT2Config, GPT2Model), + (MegatronBertConfig, MegatronBertModel), (MobileBertConfig, MobileBertModel), (TransfoXLConfig, TransfoXLModel), (XLNetConfig, XLNetModel), @@ -364,7 +378,7 @@ (CTRLConfig, CTRLModel), (ElectraConfig, ElectraModel), (ReformerConfig, ReformerModel), - (FunnelConfig, FunnelModel), + (FunnelConfig, (FunnelModel, FunnelBaseModel)), (LxmertConfig, LxmertModel), (BertGenerationConfig, BertGenerationEncoder), (DebertaConfig, DebertaModel), @@ -398,6 +412,7 @@ (BigBirdConfig, BigBirdForPreTraining), (OpenAIGPTConfig, OpenAIGPTLMHeadModel), (GPT2Config, GPT2LMHeadModel), + (MegatronBertConfig, MegatronBertForPreTraining), (MobileBertConfig, MobileBertForPreTraining), (TransfoXLConfig, TransfoXLLMHeadModel), (XLNetConfig, XLNetLMHeadModel), 
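The resolution order in `AutoFeatureExtractor.from_pretrained` ties back to the small `feature_extraction_utils` change at the top of this patch: `to_dict()` now records `feature_extractor_type`, so extractors saved with this version resolve from that key, while substring matching on the model name or path is only a fallback for older saves. A hedged usage sketch (checkpoint names and the local directory are examples):

```python
from transformers import AutoFeatureExtractor

# Resolved via the feature_extractor_type key written by save_pretrained()/to_dict(),
# or, failing that, because "wav2vec2" appears in the model id.
extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")

# A hypothetical local directory saved by an older version without that key still
# resolves as long as its path contains a registered pattern ("s2t", "vit", "wav2vec2").
extractor = AutoFeatureExtractor.from_pretrained("./wav2vec2-finetuned/")
```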
@@ -441,6 +456,7 @@ (BertConfig, BertForMaskedLM), (OpenAIGPTConfig, OpenAIGPTLMHeadModel), (GPT2Config, GPT2LMHeadModel), + (MegatronBertConfig, MegatronBertForMaskedLM), (MobileBertConfig, MobileBertForMaskedLM), (TransfoXLConfig, TransfoXLLMHeadModel), (XLNetConfig, XLNetLMHeadModel), @@ -456,6 +472,7 @@ (DebertaConfig, DebertaForMaskedLM), (DebertaV2Config, DebertaV2ForMaskedLM), (IBertConfig, IBertForMaskedLM), + (MegatronBertConfig, MegatronBertForCausalLM), ] ) @@ -487,6 +504,7 @@ (MarianConfig, MarianForCausalLM), (BlenderbotConfig, BlenderbotForCausalLM), (BlenderbotSmallConfig, BlenderbotSmallForCausalLM), + (MegatronBertConfig, MegatronBertForCausalLM), ] ) @@ -514,6 +532,7 @@ (RobertaConfig, RobertaForMaskedLM), (SqueezeBertConfig, SqueezeBertForMaskedLM), (BertConfig, BertForMaskedLM), + (MegatronBertConfig, MegatronBertForMaskedLM), (MobileBertConfig, MobileBertForMaskedLM), (FlaubertConfig, FlaubertWithLMHeadModel), (XLMConfig, XLMWithLMHeadModel), @@ -566,6 +585,7 @@ (LayoutLMConfig, LayoutLMForSequenceClassification), (BertConfig, BertForSequenceClassification), (XLNetConfig, XLNetForSequenceClassification), + (MegatronBertConfig, MegatronBertForSequenceClassification), (MobileBertConfig, MobileBertForSequenceClassification), (FlaubertConfig, FlaubertForSequenceClassification), (XLMConfig, XLMForSequenceClassification), @@ -602,6 +622,7 @@ (BertConfig, BertForQuestionAnswering), (XLNetConfig, XLNetForQuestionAnsweringSimple), (FlaubertConfig, FlaubertForQuestionAnsweringSimple), + (MegatronBertConfig, MegatronBertForQuestionAnswering), (MobileBertConfig, MobileBertForQuestionAnswering), (XLMConfig, XLMForQuestionAnsweringSimple), (ElectraConfig, ElectraForQuestionAnswering), @@ -637,6 +658,7 @@ (RobertaConfig, RobertaForTokenClassification), (SqueezeBertConfig, SqueezeBertForTokenClassification), (BertConfig, BertForTokenClassification), + (MegatronBertConfig, MegatronBertForTokenClassification), (MobileBertConfig, MobileBertForTokenClassification), (XLNetConfig, XLNetForTokenClassification), (AlbertConfig, AlbertForTokenClassification), @@ -663,6 +685,7 @@ (SqueezeBertConfig, SqueezeBertForMultipleChoice), (BertConfig, BertForMultipleChoice), (DistilBertConfig, DistilBertForMultipleChoice), + (MegatronBertConfig, MegatronBertForMultipleChoice), (MobileBertConfig, MobileBertForMultipleChoice), (XLNetConfig, XLNetForMultipleChoice), (AlbertConfig, AlbertForMultipleChoice), @@ -677,6 +700,7 @@ MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING = OrderedDict( [ (BertConfig, BertForNextSentencePrediction), + (MegatronBertConfig, MegatronBertForNextSentencePrediction), (MobileBertConfig, MobileBertForNextSentencePrediction), ] ) diff --git a/src/transformers/models/auto/modeling_tf_auto.py b/src/transformers/models/auto/modeling_tf_auto.py index 0abb08c8902cbb..2104bb644299e6 100644 --- a/src/transformers/models/auto/modeling_tf_auto.py +++ b/src/transformers/models/auto/modeling_tf_auto.py @@ -91,6 +91,7 @@ TFFlaubertWithLMHeadModel, ) from ..funnel.modeling_tf_funnel import ( + TFFunnelBaseModel, TFFunnelForMaskedLM, TFFunnelForMultipleChoice, TFFunnelForPreTraining, @@ -242,7 +243,7 @@ (XLMConfig, TFXLMModel), (CTRLConfig, TFCTRLModel), (ElectraConfig, TFElectraModel), - (FunnelConfig, TFFunnelModel), + (FunnelConfig, (TFFunnelModel, TFFunnelBaseModel)), (DPRConfig, TFDPRQuestionEncoder), (MPNetConfig, TFMPNetModel), (BartConfig, TFBartModel), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 
c4f28a43d03d6b..13089e21171c0e 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -115,6 +115,7 @@ from ..bert_generation.tokenization_bert_generation import BertGenerationTokenizer from ..big_bird.tokenization_big_bird import BigBirdTokenizer from ..camembert.tokenization_camembert import CamembertTokenizer + from ..cpm.tokenization_cpm import CpmTokenizer from ..deberta_v2.tokenization_deberta_v2 import DebertaV2Tokenizer from ..m2m_100 import M2M100Tokenizer from ..marian.tokenization_marian import MarianTokenizer @@ -134,6 +135,7 @@ BertGenerationTokenizer = None BigBirdTokenizer = None CamembertTokenizer = None + CpmTokenizer = None DebertaV2Tokenizer = None MarianTokenizer = None MBartTokenizer = None @@ -273,6 +275,7 @@ NO_CONFIG_TOKENIZER = [ BertJapaneseTokenizer, BertweetTokenizer, + CpmTokenizer, HerbertTokenizer, HerbertTokenizerFast, PhobertTokenizer, @@ -409,7 +412,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): # if model is an encoder decoder, the encoder tokenizer class is used by default if isinstance(config, EncoderDecoderConfig): if type(config.decoder) is not type(config.encoder): # noqa: E721 - logger.warn( + logger.warning( f"The encoder model config class: {config.encoder.__class__} is different from the decoder model " f"config class: {config.decoder.__class}. It is not recommended to use the " "`AutoTokenizer.from_pretrained()` method in this case. Please use the encoder and decoder " diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py index 144b61324a94a6..e5693604f8490f 100755 --- a/src/transformers/models/bart/modeling_bart.py +++ b/src/transformers/models/bart/modeling_bart.py @@ -1011,7 +1011,7 @@ def forward( if getattr(self.config, "gradient_checkpointing", False) and self.training: if use_cache: - logger.warn( + logger.warning( "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting " "`use_cache=False`..." ) diff --git a/src/transformers/models/barthez/tokenization_barthez.py b/src/transformers/models/barthez/tokenization_barthez.py index 428f6fec654661..641cc80c1da866 100644 --- a/src/transformers/models/barthez/tokenization_barthez.py +++ b/src/transformers/models/barthez/tokenization_barthez.py @@ -180,12 +180,9 @@ def get_special_tokens_mask( :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formated with special tokens for the model." 
- ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) if token_ids_1 is None: return [1] + ([0] * len(token_ids_0)) + [1] diff --git a/src/transformers/models/barthez/tokenization_barthez_fast.py b/src/transformers/models/barthez/tokenization_barthez_fast.py index 1a9610c5564603..224bfb64536f96 100644 --- a/src/transformers/models/barthez/tokenization_barthez_fast.py +++ b/src/transformers/models/barthez/tokenization_barthez_fast.py @@ -164,36 +164,6 @@ def build_inputs_with_special_tokens( sep = [self.sep_token_id] return cls + token_ids_0 + sep + sep + token_ids_1 + sep - def get_special_tokens_mask( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False - ) -> List[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` method. - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formated with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) - - if token_ids_1 is None: - return [1] + ([0] * len(token_ids_0)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] - def create_token_type_ids_from_sequences( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index 370af8b47f472a..a1176f3a4ad3cd 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -544,7 +544,7 @@ def forward( if getattr(self.config, "gradient_checkpointing", False) and self.training: if use_cache: - logger.warn( + logger.warning( "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting " "`use_cache=False`..." ) diff --git a/src/transformers/models/bert/tokenization_bert.py b/src/transformers/models/bert/tokenization_bert.py index 8f3ecfabf6f54b..fbb2cfc02950bb 100644 --- a/src/transformers/models/bert/tokenization_bert.py +++ b/src/transformers/models/bert/tokenization_bert.py @@ -290,12 +290,9 @@ def get_special_tokens_mask( """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." 
- ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) if token_ids_1 is not None: return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] diff --git a/src/transformers/models/bert_generation/modeling_bert_generation.py b/src/transformers/models/bert_generation/modeling_bert_generation.py index 57ec9345b5a4d4..6f366c7f424375 100755 --- a/src/transformers/models/bert_generation/modeling_bert_generation.py +++ b/src/transformers/models/bert_generation/modeling_bert_generation.py @@ -450,7 +450,7 @@ def __init__(self, config): super().__init__(config) if not config.is_decoder: - logger.warn("If you want to use `BertGenerationDecoder` as a standalone, add `is_decoder=True.`") + logger.warning("If you want to use `BertGenerationDecoder` as a standalone, add `is_decoder=True.`") self.bert = BertGenerationEncoder(config) self.lm_head = BertGenerationOnlyLMHead(config) diff --git a/src/transformers/models/bertweet/tokenization_bertweet.py b/src/transformers/models/bertweet/tokenization_bertweet.py index aaeffd73800c8e..bf110274da1ab8 100644 --- a/src/transformers/models/bertweet/tokenization_bertweet.py +++ b/src/transformers/models/bertweet/tokenization_bertweet.py @@ -220,12 +220,9 @@ def get_special_tokens_mask( """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) if token_ids_1 is None: return [1] + ([0] * len(token_ids_0)) + [1] diff --git a/src/transformers/models/big_bird/modeling_big_bird.py b/src/transformers/models/big_bird/modeling_big_bird.py index f7fd54b9468d97..5b5d96b4e9b95d 100755 --- a/src/transformers/models/big_bird/modeling_big_bird.py +++ b/src/transformers/models/big_bird/modeling_big_bird.py @@ -1586,7 +1586,7 @@ def forward( if getattr(self.config, "gradient_checkpointing", False) and self.training: if use_cache: - logger.warn( + logger.warning( "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting " "`use_cache=False`..." ) diff --git a/src/transformers/models/big_bird/tokenization_big_bird.py b/src/transformers/models/big_bird/tokenization_big_bird.py index 3cafcda1890fde..4d03a0a3ac89cc 100644 --- a/src/transformers/models/big_bird/tokenization_big_bird.py +++ b/src/transformers/models/big_bird/tokenization_big_bird.py @@ -219,12 +219,9 @@ def get_special_tokens_mask( :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." 
- ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) if token_ids_1 is None: return [1] + ([0] * len(token_ids_0)) + [1] diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py index abe83d018124ab..e8f6124e21481d 100755 --- a/src/transformers/models/blenderbot/modeling_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_blenderbot.py @@ -973,7 +973,7 @@ def forward( if getattr(self.config, "gradient_checkpointing", False) and self.training: if use_cache: - logger.warn( + logger.warning( "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting " "`use_cache=False`..." ) diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py index 372520bb7aa0d4..5bbedbc55f136e 100755 --- a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py @@ -974,7 +974,7 @@ def forward( if getattr(self.config, "gradient_checkpointing", False) and self.training: if use_cache: - logger.warn( + logger.warning( "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting " "`use_cache=False`..." ) diff --git a/src/transformers/models/camembert/tokenization_camembert.py b/src/transformers/models/camembert/tokenization_camembert.py index eb57acec890167..8337d6826cb807 100644 --- a/src/transformers/models/camembert/tokenization_camembert.py +++ b/src/transformers/models/camembert/tokenization_camembert.py @@ -178,12 +178,9 @@ def get_special_tokens_mask( :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) if token_ids_1 is None: return [1] + ([0] * len(token_ids_0)) + [1] diff --git a/src/transformers/models/camembert/tokenization_camembert_fast.py b/src/transformers/models/camembert/tokenization_camembert_fast.py index 648da8be701b41..a6333b98d049ad 100644 --- a/src/transformers/models/camembert/tokenization_camembert_fast.py +++ b/src/transformers/models/camembert/tokenization_camembert_fast.py @@ -162,36 +162,6 @@ def build_inputs_with_special_tokens( sep = [self.sep_token_id] return cls + token_ids_0 + sep + sep + token_ids_1 + sep - def get_special_tokens_mask( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False - ) -> List[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` method. - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. 
- already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) - - if token_ids_1 is None: - return [1] + ([0] * len(token_ids_0)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] - def create_token_type_ids_from_sequences( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: diff --git a/src/transformers/models/cpm/__init__.py b/src/transformers/models/cpm/__init__.py new file mode 100644 index 00000000000000..8c687ad8fc56e9 --- /dev/null +++ b/src/transformers/models/cpm/__init__.py @@ -0,0 +1,48 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING + +from ...file_utils import _BaseLazyModule + + +_import_structure = { + "tokenization_cpm": ["CpmTokenizer"], +} + + +if TYPE_CHECKING: + from .tokenization_cpm import CpmTokenizer + +else: + import importlib + import os + import sys + + class _LazyModule(_BaseLazyModule): + """ + Module class that surfaces all objects but only performs associated imports when the objects are requested. + """ + + __file__ = globals()["__file__"] + __path__ = [os.path.dirname(__file__)] + + def _get_module(self, module_name: str): + return importlib.import_module("." + module_name, self.__name__) + + sys.modules[__name__] = _LazyModule(__name__, _import_structure) diff --git a/src/transformers/models/cpm/tokenization_cpm.py b/src/transformers/models/cpm/tokenization_cpm.py new file mode 100644 index 00000000000000..447b86b1294363 --- /dev/null +++ b/src/transformers/models/cpm/tokenization_cpm.py @@ -0,0 +1,109 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
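The ``_BaseLazyModule`` pattern used in the new ``cpm/__init__.py`` above defers submodule imports until an attribute is first accessed. A minimal self-contained sketch of the idea (a simplified stand-in, not the actual ``_BaseLazyModule`` implementation)::

    import importlib
    import types

    class LazyModule(types.ModuleType):
        """Simplified stand-in: resolves attributes by importing the owning submodule on first access."""

        def __init__(self, name, import_structure):
            super().__init__(name)
            # map each public object name to the submodule that defines it
            self._object_to_module = {obj: mod for mod, objs in import_structure.items() for obj in objs}

        def __getattr__(self, name):
            module_name = self._object_to_module.get(name)
            if module_name is None:
                raise AttributeError(f"module {self.__name__!r} has no attribute {name!r}")
            submodule = importlib.import_module("." + module_name, self.__name__)
            return getattr(submodule, name)

    # In a package __init__.py one would then do, e.g.:
    # sys.modules[__name__] = LazyModule(__name__, {"tokenization_cpm": ["CpmTokenizer"]})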
+"""Tokenization classes.""" +from ...utils import logging +from ..xlnet.tokenization_xlnet import XLNetTokenizer + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "TsinghuaAI/CPM-Generate": "https://huggingface.co/TsinghuaAI/CPM-Generate/resolve/main/spiece.model", + } +} + + +class CpmTokenizer(XLNetTokenizer): + """Runs pre-tokenization with Jieba segmentation tool. It is used in CPM models.""" + + def __init__(self, *args, **kwargs): + """ + Construct a CPM tokenizer. Based on `Jieba ` and `SentencePiece + `__. + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main + methods. Users should refer to this superclass for more information regarding those methods. + + Args: + vocab_file (:obj:`str`): + `SentencePiece `__ file (generally has a .spm extension) that + contains the vocabulary necessary to instantiate a tokenizer. + do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to lowercase the input when tokenizing. + remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to strip the text when tokenizing (removing excess spaces before and after the string). + keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to keep accents when tokenizing. + bos_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier + token. + + .. note:: + + When building a sequence using special tokens, this is not the token that is used for the beginning + of sequence. The token used is the :obj:`cls_token`. + eos_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The end of sequence token. + + .. note:: + + When building a sequence using special tokens, this is not the token that is used for the end of + sequence. The token used is the :obj:`sep_token`. + unk_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be + this token instead. + sep_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences + for sequence classification or for a text and a question for question answering. It is also used as the + last token of a sequence built with special tokens. + pad_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The classifier token which is used when doing sequence classification (classification of the whole + sequence instead of per-token classification). It is the first token of the sequence when built with + special tokens. + mask_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["", ""]`): + Additional special tokens used by the tokenizer. + + Attributes: + sp_model (:obj:`SentencePieceProcessor`): + The `SentencePiece` processor that is used for every conversion (string, tokens and IDs). 
+ """ + super().__init__(*args, **kwargs) + try: + import jieba + except ModuleNotFoundError as error: + raise error.__class__( + "You need to install jieba to use CpmTokenizer." + "See https://pypi.org/project/jieba/ for installation." + ) + self.jieba = jieba + self.translator = str.maketrans(" \n", "\u2582\u2583") + + def _tokenize(self, text, *args, **kwargs): + text = [x.translate(self.translator) for x in self.jieba.cut(text, cut_all=False)] + text = " ".join(text) + return super()._tokenize(text, *args, **kwargs) + + def _decode(self, *args, **kwargs): + text = super()._decode(*args, **kwargs) + text = text.replace(" ", "").replace("\u2582", " ").replace("\u2583", "\n") + return text diff --git a/src/transformers/models/deberta/tokenization_deberta.py b/src/transformers/models/deberta/tokenization_deberta.py index 089c6dc509ca1d..ddd08e5286d6c2 100644 --- a/src/transformers/models/deberta/tokenization_deberta.py +++ b/src/transformers/models/deberta/tokenization_deberta.py @@ -174,12 +174,9 @@ def get_special_tokens_mask( :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) if token_ids_1 is None: return [1] + ([0] * len(token_ids_0)) + [1] diff --git a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py index a0e80f6b007a14..78509f88d774c7 100644 --- a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py +++ b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py @@ -187,16 +187,8 @@ def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_spe """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list( - map( - lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, - token_ids_0, - ) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True ) if token_ids_1 is not None: diff --git a/src/transformers/models/electra/modeling_electra.py b/src/transformers/models/electra/modeling_electra.py index 913d269ad5063c..8f77289fe53518 100644 --- a/src/transformers/models/electra/modeling_electra.py +++ b/src/transformers/models/electra/modeling_electra.py @@ -541,7 +541,7 @@ def forward( if getattr(self.config, "gradient_checkpointing", False) and self.training: if use_cache: - logger.warn( + logger.warning( "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting " "`use_cache=False`..." 
) diff --git a/src/transformers/models/fsmt/tokenization_fsmt.py b/src/transformers/models/fsmt/tokenization_fsmt.py index 124a9541d7e4d8..226d18cc3eab0e 100644 --- a/src/transformers/models/fsmt/tokenization_fsmt.py +++ b/src/transformers/models/fsmt/tokenization_fsmt.py @@ -437,16 +437,8 @@ def get_special_tokens_mask( """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list( - map( - lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, - token_ids_0, - ) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True ) # no bos used in fairseq if token_ids_1 is not None: diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index 2a8fb28162053c..881b17b2d8760b 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -726,7 +726,7 @@ def forward( if getattr(self.config, "gradient_checkpointing", False) and self.training: if use_cache: - logger.warn( + logger.warning( "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting " "`use_cache=False`..." ) diff --git a/src/transformers/models/gpt_neo/modeling_gpt_neo.py b/src/transformers/models/gpt_neo/modeling_gpt_neo.py index 9fb0d7475fb9d6..5808601d6b0f62 100755 --- a/src/transformers/models/gpt_neo/modeling_gpt_neo.py +++ b/src/transformers/models/gpt_neo/modeling_gpt_neo.py @@ -112,6 +112,10 @@ def load_tf_weights_in_gpt_neo(model, config, gpt_neo_checkpoint_path): if name[-1] == "w" and name[-2] in ["out_proj", "k_proj", "q_proj", "v_proj", "c_proj", "c_fc"]: array = array.transpose() + if name == ["wte"]: + # if vocab is padded, then trim off the padding embeddings + array = array[: config.vocab_size] + try: assert ( pointer.shape == array.shape @@ -130,7 +134,130 @@ def load_tf_weights_in_gpt_neo(model, config, gpt_neo_checkpoint_path): return model -class GPTNeoSelfAttention(nn.Module): +class GPTNeoAttentionMixin: + """ + A few attention related utilities for attention modules in GPT Neo, to be used as a mixin. + """ + + @staticmethod + def _get_block_length_and_num_blocks(seq_length, window_size): + """ + Computes ``block_length`` and ``num_blocks`` such that ``seq_length`` becomes evenly divisible by + ``block_length``. + """ + block_length = window_size + while seq_length % block_length != 0: + block_length -= 1 + num_blocks = seq_length // block_length + return block_length, num_blocks + + @staticmethod + def _look_back(tensor, block_length, window_size, pad_value=0, is_key_value=True): + """ + Used to implement attention between consecutive blocks. This method assumes that dim 1 of :obj:`tensor` + represents the :obj:`seq_length` dimention. It splits :obj:`seq_length` dimention into :obj:`num_blocks` and + :obj:`window_size` + :obj:`block_length`. It pads the :obj:`seq_length` dimention if necessary. 
+ + Example:: + + tensor: torch.tensor([[[ 0.4983], [ 2.6918], [-0.0071], [ 1.0492], [-1.8348], [ 0.7672], [ 0.2986], [ 0.0285]]]) + with shape (1, 8, 1) + block_length = window_size = 4 + _look_back => + torch.tensor([[[[ 0.0000], [ 0.0000], [ 0.0000], [ 0.0000], [ 0.4983], [ 2.6918], [-0.0071], [ 1.0492]], + [[ 0.4983], [ 2.6918], [-0.0071], [ 1.0492], [-1.8348], [ 0.7672], [ 0.2986], [ 0.0285]]]]) + + Args: + tensor (:obj:`torch.Tensor`): tensor of shape :obj:`[batch_size, seq_length, hidden_dim]` or :obj:`[batch_size, seq_length]` + block_length (:obj:`int`): An integer specifying the length of each block, used as a step size when creating the blocks. + window_size (:obj:`int`): An integer specifying the size of attention window, used to calculate the final block size when creating the block. + pad_value (obj:`int`): An integer specifying the value to use when padding the :obj:`tensor`. + is_key_value (:obj:`bool`): A boolean indicating if the :obj:`tensor` is a key/value tensor. + + Returns: + tensor of shape :obj:`[batch_size, num_blocks, window_size + block_length, ...]` if :obj:`is_key_value` is + :obj:`True` else a tensor of shape :obj:`[batch_size, window_size + block_length, num_blocks, ...]` + """ + if len(tensor.shape) == 3: + padding_side = (0, 0, window_size, 0) + elif len(tensor.shape) == 2: + padding_side = (window_size, 0) + else: + raise ValueError(f"Input tensor rank should be one of [2, 3], but is: {len(tensor.shape)}") + + padded_tensor = F.pad(tensor, padding_side, value=pad_value) + padded_tensor = padded_tensor.unfold(dimension=1, size=window_size + block_length, step=block_length) + + if is_key_value: + padded_tensor = padded_tensor.transpose(-2, -1) + return padded_tensor + + def _split_heads(self, tensor, num_heads, attn_head_size): + """ + Splits hidden_size dim into attn_head_size and num_heads + """ + new_shape = tensor.size()[:-1] + (num_heads, attn_head_size) + tensor = tensor.view(*new_shape) + if len(tensor.shape) == 5: + return tensor.permute(0, 1, 3, 2, 4) # (batch, blocks, head, block_length, head_features) + elif len(tensor.shape) == 4: + return tensor.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features) + else: + raise ValueError(f"Input tensor rank should be one of [4, 5], but is: {len(tensor.shape)}") + + def _merge_heads(self, tensor, num_heads, attn_head_size): + """ + Merges attn_head_size dim and num_attn_heads dim into hidden_size + """ + if len(tensor.shape) == 5: + tensor = tensor.permute(0, 1, 3, 2, 4).contiguous() + elif len(tensor.shape) == 4: + tensor = tensor.permute(0, 2, 1, 3).contiguous() + else: + raise ValueError(f"Input tensor rank should be one of [4, 5], but is: {len(tensor.shape)}") + new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,) + return tensor.view(new_shape) + + def _split_seq_length_dim_to(self, tensors, dim_factor_1, dim_factor_2, hidden_size): + """ + Splits sequence length dim of tensors into `dim_factor_1` and `dim_factor_2` dims + """ + batch_size = tensors.shape[0] + split_dim_shape = (batch_size, dim_factor_1, dim_factor_2) + + if len(tensors.shape) == 3: + return torch.reshape(tensors, split_dim_shape + (hidden_size,)) + elif len(tensors.shape) == 2: + return torch.reshape(tensors, split_dim_shape) + else: + raise ValueError(f"Input vector rank should be one of [2, 3], but is: {len(tensors.shape)}") + + def _attn(self, query, key, value, causal_mask, masked_bias, attn_dropout, attention_mask=None, head_mask=None): + # Keep the attention weights computation in fp32 to avoid overflow issues 
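A minimal standalone sketch of the ``_look_back`` blocking documented above, covering only the 3-D ``(batch, seq_length, hidden)`` case; the function name and toy sizes are illustrative::

    import torch
    import torch.nn.functional as F

    def look_back(tensor, block_length, window_size, pad_value=0.0):
        # left-pad the sequence dimension so the first block has a full window to look back into
        padded = F.pad(tensor, (0, 0, window_size, 0), value=pad_value)
        # one overlapping slice of length window_size + block_length per block
        blocks = padded.unfold(dimension=1, size=window_size + block_length, step=block_length)
        return blocks.transpose(-2, -1)  # (batch, num_blocks, window_size + block_length, hidden)

    hidden_states = torch.arange(8.0).view(1, 8, 1)   # (batch=1, seq_length=8, hidden=1)
    blocks = look_back(hidden_states, block_length=4, window_size=4)
    print(blocks.shape)        # torch.Size([1, 2, 8, 1])
    print(blocks[0, 0, :, 0])  # tensor([0., 0., 0., 0., 0., 1., 2., 3.])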
+ query = query.to(torch.float32) + key = key.to(torch.float32) + + attn_weights = torch.matmul(query, key.transpose(-1, -2)) + attn_weights = torch.where(causal_mask, attn_weights, masked_bias.to(attn_weights.dtype)) + + if attention_mask is not None: + # Apply the attention mask + attn_weights = attn_weights + attention_mask + + attn_weights = nn.Softmax(dim=-1)(attn_weights) + attn_weights = attn_weights.to(value.dtype) + attn_weights = attn_dropout(attn_weights) + + # Mask heads if we want to + if head_mask is not None: + attn_weights = attn_weights * head_mask + + attn_output = torch.matmul(attn_weights, value) + + return attn_output, attn_weights + + +class GPTNeoSelfAttention(nn.Module, GPTNeoAttentionMixin): def __init__(self, config): super().__init__() @@ -149,56 +276,16 @@ def __init__(self, config): self.embed_dim = config.hidden_size self.num_heads = config.num_heads self.head_dim = self.embed_dim // self.num_heads - assert ( - self.head_dim * self.num_heads == self.embed_dim - ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})." + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})." + ) self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False) self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False) self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False) self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True) - def _attn(self, q, k, v, attention_mask=None, head_mask=None, output_attentions=False): - # Keep the attention weights computation in fp32 to avoid overflow issues - q = q.to(torch.float32) - k = k.to(torch.float32) - - attn_weights = torch.matmul(q, k) - nd, ns = attn_weights.size(-2), attn_weights.size(-1) - - mask = self.bias[:, :, ns - nd : ns, :ns] - attn_weights = torch.where(mask.bool(), attn_weights, self.masked_bias.to(attn_weights.dtype)) - - if attention_mask is not None: - # Apply the attention mask - attn_weights = attn_weights + attention_mask - - attn_weights = nn.Softmax(dim=-1)(attn_weights) - attn_weights = attn_weights.to(v.dtype) - attn_weights = self.attn_dropout(attn_weights) - - # Mask heads if we want to - if head_mask is not None: - attn_weights = attn_weights * head_mask - - outputs = (torch.matmul(attn_weights, v),) - if output_attentions: - outputs += (attn_weights,) - return outputs - - def merge_heads(self, x): - x = x.permute(0, 2, 1, 3).contiguous() - new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),) - return x.view(*new_x_shape) # in Tensorflow implem: fct merge_states - - def split_heads(self, x, k=False): - new_x_shape = x.size()[:-1] + (self.num_heads, x.size(-1) // self.num_heads) - x = x.view(*new_x_shape) # in Tensorflow implem: fct split_states - if k: - return x.permute(0, 2, 3, 1) # (batch, head, head_features, seq_length) - else: - return x.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features) - def forward( self, hidden_states, @@ -213,31 +300,40 @@ def forward( key = self.k_proj(hidden_states) value = self.v_proj(hidden_states) - query = self.split_heads(query) - key = self.split_heads(key, k=True) - value = self.split_heads(value) + query = self._split_heads(query, self.num_heads, self.head_dim) + key = self._split_heads(key, self.num_heads, self.head_dim) + value = self._split_heads(value, self.num_heads, self.head_dim) if layer_past is not None: - past_key, 
past_value = layer_past[0].transpose(-2, -1), layer_past[1] # transpose back cf below - key = torch.cat((past_key, key), dim=-1) + past_key = layer_past[0] + past_value = layer_past[1] + key = torch.cat((past_key, key), dim=-2) value = torch.cat((past_value, value), dim=-2) if use_cache is True: - present = (key.transpose(-2, -1), value) # transpose to have same shapes + present = (key, value) else: present = None - attn_outputs = self._attn(query, key, value, attention_mask, head_mask, output_attentions) - a = attn_outputs[0] + query_length, key_length = query.size(-2), key.size(-2) + causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].bool() + + attn_output, attn_weights = self._attn( + query, key, value, causal_mask, self.masked_bias, self.attn_dropout, attention_mask, head_mask + ) - a = self.merge_heads(a) - a = self.out_proj(a) - a = self.resid_dropout(a) + attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim) + attn_output = self.out_proj(attn_output) + attn_output = self.resid_dropout(attn_output) + + outputs = (attn_output, present) + if output_attentions: + outputs += (attn_weights,) - return (a, present) + attn_outputs[1:] # a, present, (attentions) + return outputs # a, present, (attentions) -class GPTNeoLocalSelfAttention(nn.Module): +class GPTNeoLocalSelfAttention(nn.Module, GPTNeoAttentionMixin): def __init__(self, config): super().__init__() @@ -249,9 +345,10 @@ def __init__(self, config): self.embed_dim = config.hidden_size self.num_heads = config.num_heads self.head_dim = self.embed_dim // self.num_heads - assert ( - self.head_dim * self.num_heads == self.embed_dim - ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})." + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})." + ) self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False) self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False) @@ -260,94 +357,39 @@ def __init__(self, config): self.window_size = config.window_size - def shift(self, x, offset, pad_value=0, dim=2): - t = x.shape[1] - dims = (len(x.shape) - dim) * (0, 0) - padded_x = F.pad(x, (*dims, offset, 0), value=pad_value) - return padded_x[:, :t, ...] - - def look_around(self, x, block_length, window_size): - num_complete_blocks = window_size // block_length - - parts = [x] - for i in range(1, num_complete_blocks + 1): - parts = [self.shift(x, i)] + parts - - partial_size = window_size % block_length - if partial_size > 0: - margin = x[:, :, block_length - partial_size : block_length, ...] 
- parts = [self.shift(margin, num_complete_blocks + 1)] + parts - return torch.cat(parts, dim=2) - - def split_heads(self, x, k=False): - new_x_shape = x.size()[:-1] + (self.num_heads, x.size(-1) // self.num_heads) - x = x.view(*new_x_shape) - if k: - return x.permute(0, 1, 3, 4, 2) # (batch, chunks, head, head_features, seq_length) - else: - return x.permute(0, 1, 3, 2, 4) # (batch, chunks, head, seq_length, head_features) - - def merge_heads(self, x): - x = x.permute(0, 1, 3, 2, 4).contiguous() - new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),) - return x.view(*new_x_shape) + def _create_attention_mask(self, batch_size, seq_length, num_blocks, block_length, device, attention_mask=None): + indices = torch.arange(seq_length, dtype=torch.long, device=device).repeat(batch_size, 1) - def _split_seq_length_dim_to(self, tensors, num_blocks, block_length): - return tensors.reshape(tensors.size()[0], num_blocks, block_length, -1) + query_indices = self._split_seq_length_dim_to(indices, num_blocks, block_length, self.embed_dim) + key_indices = self._look_back(indices, block_length, self.window_size, is_key_value=False) - def create_attention_mask(self, bs, seq_len, windows, block_length, attention_mask): - ticker = torch.arange(seq_len)[None, :] - b_t = ticker.reshape(1, windows, block_length) + # create mask tensor such that each block contains a causal_mask for that block + causal_mask = torch.ge(query_indices.unsqueeze(-1), key_indices.unsqueeze(-2)) - bq_t = b_t - bq_k = self.look_around(b_t, block_length, self.window_size) + if attention_mask is None: + attention_mask = torch.ones(batch_size, seq_length, dtype=torch.long, device=device) - # compute attn mask - # this matches the original implem in mess-tensorflow - # https://github.com/tensorflow/mesh/blob/8bd599a21bad01cef1300a8735c17306ce35db6e/mesh_tensorflow/transformer/attention.py#L805 - relative_position = bq_k.unsqueeze(-2) - bq_t.unsqueeze(-1) - relative_position = relative_position.transpose(-1, -2) + # A block can also be padded becuase of the _look_back operation + # look back into the attention_block such that it will also get padded the same way + # and have 0s in the padded position + attention_mask = self._look_back(attention_mask, block_length, self.window_size, is_key_value=False) + attention_mask = attention_mask.unsqueeze(-2) # Add an extra dimention to account for hidden_dim - sequence_id = torch.ones(bs, seq_len) - q_seq = sequence_id.reshape(-1, windows, block_length) - m_seq = sequence_id.reshape(-1, windows, block_length) - m_seq = self.look_around(m_seq, block_length, self.window_size) + # Multiply the causal_mask with attention_mask so the padded positions (by _look_back operation) + # will contain 0s. + # This also makes sure that other positions ignored by the attention_mask will also be ignored + # in the causal_mask. + causal_mask = causal_mask * attention_mask - if attention_mask is not None: - attention_mask = attention_mask.to(m_seq.device) - attention_mask = attention_mask.reshape(-1, windows, block_length) - attention_mask = self.look_around(attention_mask, block_length, self.window_size) - m_seq *= attention_mask + # In GPT Neo's local attention each window can attend to at most window_size tokens + # rest of the tokens should be ignored. 
+ relative_position = key_indices.unsqueeze(-2) - query_indices.unsqueeze(-1) + visible = torch.gt(relative_position, -self.window_size) - visible = torch.eq(q_seq.unsqueeze(-1), m_seq.unsqueeze(-2)).transpose(-1, -2) - visible = torch.logical_and(visible, torch.gt(relative_position, -self.window_size)) - mask = torch.logical_and(visible, torch.less_equal(relative_position, 0)).transpose(-1, -2).unsqueeze(2) - return mask + causal_mask = causal_mask * visible + causal_mask = causal_mask.unsqueeze(-3).bool() # Add an extra dimention to account for num_heads - def _attn(self, q, k, v, causal_mask, head_mask=None, output_attentions=False): - # attn - - # Keep the attention weights computation in fp32 to avoid overflow issues - q = q.to(torch.float32) - k = k.to(torch.float32) - - attn_weights = torch.matmul(q, k) - attn_weights = torch.where(causal_mask, attn_weights, self.masked_bias.to(attn_weights.dtype)) - - attn_weights = nn.Softmax(dim=-1)(attn_weights) - attn_weights = attn_weights.to(v.dtype) - attn_weights = self.attn_dropout(attn_weights) - - # Mask heads if we want to - if head_mask is not None: - attn_weights = attn_weights * head_mask - - attn_output = torch.matmul(attn_weights, v) - - outputs = (attn_output,) - if output_attentions: - outputs += (attn_weights,) - return outputs + return causal_mask def forward( self, @@ -371,51 +413,58 @@ def forward( key = self.k_proj(key_value_hidden_states) value = self.v_proj(key_value_hidden_states) - # compute block length and windows - bs, seq_len = hidden_states.shape[:2] - full_seq_length = seq_len + past_length - block_length = self.window_size - while full_seq_length % block_length != 0: - block_length -= 1 - num_blocks = full_seq_length // block_length + # compute block length and num_blocks + batch_size, seq_length = hidden_states.shape[:2] + full_seq_length = seq_length + past_length + block_length, num_blocks = self._get_block_length_and_num_blocks(full_seq_length, self.window_size) # create buckets if layer_past is not None: - # we just need 1 window with block_length 1 when caching is enabled - query = self._split_seq_length_dim_to(query, 1, 1) + # we just need 1 block with block_length 1 when caching is enabled + query = self._split_seq_length_dim_to(query, 1, 1, self.embed_dim) else: - query = self._split_seq_length_dim_to(query, num_blocks, block_length) - - key = self._split_seq_length_dim_to(key, num_blocks, block_length) - value = self._split_seq_length_dim_to(value, num_blocks, block_length) + query = self._split_seq_length_dim_to(query, num_blocks, block_length, self.embed_dim) - key = self.look_around(key, block_length, self.window_size) - value = self.look_around(value, block_length, self.window_size) + key = self._look_back(key, block_length, self.window_size) + value = self._look_back(value, block_length, self.window_size) - # select key/value vectors only for the last window + # select key/value vectors only for the last block if layer_past is not None: key = key[:, -1:, ...] value = value[:, -1:, ...] 
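The banded mask built by ``_create_attention_mask`` above can be reproduced on toy sizes with the sketch below; it uses the same blocking as ``_look_back``, and the padding handling done via the looked-back ``attention_mask`` in the real code is approximated here by a simple validity check::

    import torch
    import torch.nn.functional as F

    seq_length, block_length, window_size = 8, 4, 4
    num_blocks = seq_length // block_length

    indices = torch.arange(seq_length).view(1, seq_length)
    query_idx = indices.view(1, num_blocks, block_length)                 # (1, num_blocks, block_length)
    key_idx = F.pad(indices, (window_size, 0), value=-1)                  # -1 marks slots before the sequence start
    key_idx = key_idx.unfold(dimension=1, size=window_size + block_length, step=block_length)

    causal = torch.ge(query_idx.unsqueeze(-1), key_idx.unsqueeze(-2))     # key position <= query position
    in_window = torch.gt(key_idx.unsqueeze(-2) - query_idx.unsqueeze(-1), -window_size)
    valid = key_idx.unsqueeze(-2) >= 0                                    # drop the left-padded slots
    mask = causal & in_window & valid                                     # (1, num_blocks, block_length, window + block)

    print(mask.shape)        # torch.Size([1, 2, 4, 8])
    print(mask[0, 1].int())  # each query attends to itself and at most window_size - 1 previous positions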
- query = self.split_heads(query) - key = self.split_heads(key, k=True) - value = self.split_heads(value) + query = self._split_heads(query, self.num_heads, self.head_dim) + key = self._split_heads(key, self.num_heads, self.head_dim) + value = self._split_heads(value, self.num_heads, self.head_dim) - mask = self.create_attention_mask(bs, full_seq_length, num_blocks, block_length, attention_mask) + mask = self._create_attention_mask( + batch_size, full_seq_length, num_blocks, block_length, hidden_states.device, attention_mask + ) if layer_past is not None: - mask = mask[:, -1:, :, -1:, :] # only take the mask for the last window - mask = mask.to(hidden_states.device) + mask = mask[:, -1:, :, -1:, :] # only take the mask for the last block # attn - attn_outputs = self._attn(query, key, value, mask, head_mask, output_attentions) - attn = attn_outputs[0] + attn_output, attn_weights = self._attn( + query, + key, + value, + causal_mask=mask, + masked_bias=self.masked_bias, + attn_dropout=self.attn_dropout, + head_mask=head_mask, + ) - attn = self.merge_heads(attn) - attn = attn.reshape(bs, seq_len, self.embed_dim) + attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim) + attn_output = attn_output.reshape(batch_size, seq_length, self.embed_dim) - attn = self.out_proj(attn) - attn = self.resid_dropout(attn) - return (attn,) + attn_outputs[1:] + attn_output = self.out_proj(attn_output) + attn_output = self.resid_dropout(attn_output) + + outputs = (attn_output,) + if output_attentions: + outputs += (attn_weights,) + + return outputs # a, (attentions) class GPTNeoAttention(nn.Module): @@ -464,7 +513,7 @@ def forward( return outputs -class MLP(nn.Module): +class GPTNeoMLP(nn.Module): def __init__(self, intermediate_size, config): # in MLP: intermediate_size= 4 * hidden_size super().__init__() embed_dim = config.hidden_size @@ -473,13 +522,15 @@ def __init__(self, intermediate_size, config): # in MLP: intermediate_size= 4 * self.act = ACT2FN[config.activation_function] self.dropout = nn.Dropout(config.resid_dropout) - def forward(self, x): - h = self.act(self.c_fc(x)) - h2 = self.c_proj(h) - return self.dropout(h2) + def forward(self, hidden_states): + hidden_states = self.c_fc(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.c_proj(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states -class Block(nn.Module): +class GPTNeoBlock(nn.Module): def __init__(self, config, layer_id): super().__init__() hidden_size = config.hidden_size @@ -487,7 +538,7 @@ def __init__(self, config, layer_id): self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) self.attn = GPTNeoAttention(config, layer_id) self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - self.mlp = MLP(inner_dim, config) + self.mlp = GPTNeoMLP(inner_dim, config) def forward( self, @@ -498,8 +549,10 @@ def forward( use_cache=False, output_attentions=False, ): + residual = hidden_states + hidden_states = self.ln_1(hidden_states) attn_outputs = self.attn( - self.ln_1(hidden_states), + hidden_states, layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask, @@ -509,11 +562,13 @@ def forward( attn_output = attn_outputs[0] # output_attn: a, present, (attentions) outputs = attn_outputs[1:] # residual connection - hidden_states = attn_output + hidden_states + hidden_states = attn_output + residual - feed_forward_hidden_states = self.mlp(self.ln_2(hidden_states)) + residual = hidden_states + hidden_states = self.ln_2(hidden_states) 
+ feed_forward_hidden_states = self.mlp(hidden_states) # residual connection - hidden_states = hidden_states + feed_forward_hidden_states + hidden_states = residual + feed_forward_hidden_states if use_cache: outputs = (hidden_states,) + outputs @@ -638,7 +693,7 @@ def _init_weights(self, module): @add_start_docstrings( - "The bare GPTNeo Model transformer outputting raw hidden-states without any specific head on top.", + "The bare GPT Neo Model transformer outputting raw hidden-states without any specific head on top.", GPT_NEO_START_DOCSTRING, ) class GPTNeoModel(GPTNeoPreTrainedModel): @@ -649,7 +704,7 @@ def __init__(self, config): self.wte = nn.Embedding(config.vocab_size, self.embed_dim) self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim) self.drop = nn.Dropout(config.embed_dropout) - self.h = nn.ModuleList([Block(config, layer_id=i) for i in range(config.num_layers)]) + self.h = nn.ModuleList([GPTNeoBlock(config, layer_id=i) for i in range(config.num_layers)]) self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) self.init_weights() @@ -768,7 +823,7 @@ def forward( if getattr(self.config, "gradient_checkpointing", False) and self.training: if use_cache: - logger.warn( + logger.warning( "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting " "`use_cache=False`..." ) diff --git a/src/transformers/models/herbert/tokenization_herbert_fast.py b/src/transformers/models/herbert/tokenization_herbert_fast.py index 2e5ba1d17ad984..7a67d5e737e36c 100644 --- a/src/transformers/models/herbert/tokenization_herbert_fast.py +++ b/src/transformers/models/herbert/tokenization_herbert_fast.py @@ -126,12 +126,9 @@ def get_special_tokens_mask( :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) if token_ids_1 is None: return [1] + ([0] * len(token_ids_0)) + [1] diff --git a/src/transformers/models/layoutlm/modeling_layoutlm.py b/src/transformers/models/layoutlm/modeling_layoutlm.py index 3211d6a0f2aec2..bce2ddd27534db 100644 --- a/src/transformers/models/layoutlm/modeling_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_layoutlm.py @@ -470,7 +470,7 @@ def forward( if getattr(self.config, "gradient_checkpointing", False) and self.training: if use_cache: - logger.warn( + logger.warning( "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting " "`use_cache=False`..." ) diff --git a/src/transformers/models/led/modeling_led.py b/src/transformers/models/led/modeling_led.py index 38da6e3bdc1ba1..eecfcc27f60f4a 100755 --- a/src/transformers/models/led/modeling_led.py +++ b/src/transformers/models/led/modeling_led.py @@ -2070,7 +2070,7 @@ def forward( if getattr(self.config, "gradient_checkpointing", False) and self.training: if use_cache: - logger.warn( + logger.warning( "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting " "`use_cache=False`..." 
) diff --git a/src/transformers/models/m2m_100/modeling_m2m_100.py b/src/transformers/models/m2m_100/modeling_m2m_100.py index 2ef53d8f2b24cc..940ae65156fc6e 100755 --- a/src/transformers/models/m2m_100/modeling_m2m_100.py +++ b/src/transformers/models/m2m_100/modeling_m2m_100.py @@ -968,7 +968,7 @@ def forward( if getattr(self.config, "gradient_checkpointing", False) and self.training: if use_cache: - logger.warn( + logger.warning( "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting " "`use_cache=False`..." ) diff --git a/src/transformers/models/m2m_100/tokenization_m2m_100.py b/src/transformers/models/m2m_100/tokenization_m2m_100.py index cbd8a0aa0d8773..3d2f273d723b0d 100644 --- a/src/transformers/models/m2m_100/tokenization_m2m_100.py +++ b/src/transformers/models/m2m_100/tokenization_m2m_100.py @@ -207,12 +207,10 @@ def get_special_tokens_mask( """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + prefix_ones = [1] * len(self.prefix_tokens) suffix_ones = [1] * len(self.suffix_tokens) if token_ids_1 is None: diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py index 0548373a0597fc..7da158680f3aec 100755 --- a/src/transformers/models/marian/modeling_marian.py +++ b/src/transformers/models/marian/modeling_marian.py @@ -981,7 +981,7 @@ def forward( if getattr(self.config, "gradient_checkpointing", False) and self.training: if use_cache: - logger.warn( + logger.warning( "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting " "`use_cache=False`..." ) diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py index 61763cc38c73f6..40be2149e6ffe7 100755 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -1020,7 +1020,7 @@ def forward( if getattr(self.config, "gradient_checkpointing", False) and self.training: if use_cache: - logger.warn( + logger.warning( "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting " "`use_cache=False`..." ) diff --git a/src/transformers/models/mbart/tokenization_mbart.py b/src/transformers/models/mbart/tokenization_mbart.py index c256132d7e73d0..a38aaf7ef3ab17 100644 --- a/src/transformers/models/mbart/tokenization_mbart.py +++ b/src/transformers/models/mbart/tokenization_mbart.py @@ -149,12 +149,10 @@ def get_special_tokens_mask( """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." 
- ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + prefix_ones = [1] * len(self.prefix_tokens) suffix_ones = [1] * len(self.suffix_tokens) if token_ids_1 is None: diff --git a/src/transformers/models/mbart/tokenization_mbart50.py b/src/transformers/models/mbart/tokenization_mbart50.py index f5f1a2f60f24f0..5afd9b215f3919 100644 --- a/src/transformers/models/mbart/tokenization_mbart50.py +++ b/src/transformers/models/mbart/tokenization_mbart50.py @@ -241,12 +241,10 @@ def get_special_tokens_mask( """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + prefix_ones = [1] * len(self.prefix_tokens) suffix_ones = [1] * len(self.suffix_tokens) if token_ids_1 is None: diff --git a/src/transformers/models/mbart/tokenization_mbart50_fast.py b/src/transformers/models/mbart/tokenization_mbart50_fast.py index bda4b7cf36d150..f22d02e59b724e 100644 --- a/src/transformers/models/mbart/tokenization_mbart50_fast.py +++ b/src/transformers/models/mbart/tokenization_mbart50_fast.py @@ -160,38 +160,6 @@ def src_lang(self, new_src_lang: str) -> None: self._src_lang = new_src_lang self.set_src_lang_special_tokens(self._src_lang) - def get_special_tokens_mask( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False - ) -> List[int]: - """ - Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` method. - - Args: - token_ids_0 (:obj:`List[int]`): - List of ids. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - - if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." 
- ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) - prefix_ones = [1] * len(self.prefix_tokens) - suffix_ones = [1] * len(self.suffix_tokens) - if token_ids_1 is None: - return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones - return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones - def build_inputs_with_special_tokens( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: diff --git a/src/transformers/models/mbart/tokenization_mbart_fast.py b/src/transformers/models/mbart/tokenization_mbart_fast.py index e69021831506fc..bbe9ed7d5d3d55 100644 --- a/src/transformers/models/mbart/tokenization_mbart_fast.py +++ b/src/transformers/models/mbart/tokenization_mbart_fast.py @@ -131,38 +131,6 @@ def src_lang(self, new_src_lang: str) -> None: self._src_lang = new_src_lang self.set_src_lang_special_tokens(self._src_lang) - def get_special_tokens_mask( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False - ) -> List[int]: - """ - Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` method. - - Args: - token_ids_0 (:obj:`List[int]`): - List of ids. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - - if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) - prefix_ones = [1] * len(self.prefix_tokens) - suffix_ones = [1] * len(self.suffix_tokens) - if token_ids_1 is None: - return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones - return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones - def build_inputs_with_special_tokens( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: diff --git a/src/transformers/models/megatron_bert/__init__.py b/src/transformers/models/megatron_bert/__init__.py new file mode 100644 index 00000000000000..714f1b1ecc78ad --- /dev/null +++ b/src/transformers/models/megatron_bert/__init__.py @@ -0,0 +1,74 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2021 NVIDIA Corporation and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING + +from ...file_utils import _BaseLazyModule, is_torch_available + + +_import_structure = { + "configuration_megatron_bert": ["MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "MegatronBertConfig"], +} + +if is_torch_available(): + _import_structure["modeling_megatron_bert"] = [ + "MEGATRON_BERT_PRETRAINED_MODEL_ARCHIVE_LIST", + "MegatronBertForCausalLM", + "MegatronBertForMaskedLM", + "MegatronBertForMultipleChoice", + "MegatronBertForNextSentencePrediction", + "MegatronBertForPreTraining", + "MegatronBertForQuestionAnswering", + "MegatronBertForSequenceClassification", + "MegatronBertForTokenClassification", + "MegatronBertModel", + ] + +if TYPE_CHECKING: + from .configuration_megatron_bert import MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, MegatronBertConfig + + if is_torch_available(): + from .modeling_megatron_bert import ( + MEGATRON_BERT_PRETRAINED_MODEL_ARCHIVE_LIST, + MegatronBertForCausalLM, + MegatronBertForMaskedLM, + MegatronBertForMultipleChoice, + MegatronBertForNextSentencePrediction, + MegatronBertForPreTraining, + MegatronBertForQuestionAnswering, + MegatronBertForSequenceClassification, + MegatronBertForTokenClassification, + MegatronBertModel, + ) + +else: + import importlib + import os + import sys + + class _LazyModule(_BaseLazyModule): + """ + Module class that surfaces all objects but only performs associated imports when the objects are requested. + """ + + __file__ = globals()["__file__"] + __path__ = [os.path.dirname(__file__)] + + def _get_module(self, module_name: str): + return importlib.import_module("." + module_name, self.__name__) + + sys.modules[__name__] = _LazyModule(__name__, _import_structure) diff --git a/src/transformers/models/megatron_bert/configuration_megatron_bert.py b/src/transformers/models/megatron_bert/configuration_megatron_bert.py new file mode 100644 index 00000000000000..19171e70da1bc2 --- /dev/null +++ b/src/transformers/models/megatron_bert/configuration_megatron_bert.py @@ -0,0 +1,132 @@ +# coding=utf-8 +# Copyright 2021- NVIDIA Corporation and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" MEGATRON_BERT model configuration """ + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { + # See all MEGATRON_BERT models at https://huggingface.co/models?filter=bert +} + + +class MegatronBertConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a :class:`~transformers.MegatronBertModel`. It is + used to instantiate a MEGATRON_BERT model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the MEGATRON_BERT + `megatron-bert-uncased-345m `__ architecture. + + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model + outputs. 
Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+
+
+    Args:
+        vocab_size (:obj:`int`, `optional`, defaults to 29056):
+            Vocabulary size of the MEGATRON_BERT model. Defines the number of different tokens that can be represented
+            by the :obj:`inputs_ids` passed when calling :class:`~transformers.MegatronBertModel`.
+        hidden_size (:obj:`int`, `optional`, defaults to 1024):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (:obj:`int`, `optional`, defaults to 24):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (:obj:`int`, `optional`, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (:obj:`int`, `optional`, defaults to 4096):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+        hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string,
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
+        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
+            The vocabulary size of the :obj:`token_type_ids` passed when calling
+            :class:`~transformers.MegatronBertModel`.
+        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            If True, use gradient checkpointing to save memory at the expense of a slower backward pass.
+        position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`):
+            Type of position embedding. Choose one of :obj:`"absolute"`, :obj:`"relative_key"`,
+            :obj:`"relative_key_query"`. For positional embeddings use :obj:`"absolute"`. For more information on
+            :obj:`"relative_key"`, please refer to `Self-Attention with Relative Position Representations (Shaw et al.)
+            <https://arxiv.org/abs/1803.02155>`__. For more information on :obj:`"relative_key_query"`, please refer to
+            `Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)
+            <https://arxiv.org/abs/2009.13658>`__.
+        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if ``config.is_decoder=True``.
+ + Examples:: + + >>> from transformers import MegatronBertModel, MegatronBertConfig + + >>> # Initializing a MEGATRON_BERT bert-base-uncased style configuration + >>> configuration = MegatronBertConfig() + + >>> # Initializing a model from the bert-base-uncased style configuration + >>> model = MegatronBertModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + """ + model_type = "megatron-bert" + + def __init__( + self, + vocab_size=29056, + hidden_size=1024, + num_hidden_layers=24, + num_attention_heads=16, + intermediate_size=4096, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + gradient_checkpointing=False, + position_embedding_type="absolute", + use_cache=True, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.gradient_checkpointing = gradient_checkpointing + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache diff --git a/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py b/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py new file mode 100644 index 00000000000000..3d7f03dcbb767c --- /dev/null +++ b/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py @@ -0,0 +1,265 @@ +#################################################################################################### + +# Copyright (c) 2021-, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#################################################################################################### + +import argparse +import json +import os +import re +import zipfile + +import torch + + +#################################################################################################### + + +def recursive_print(name, val, spaces=0): + # Format the message. + if name is None: + msg = None + else: + fmt = "." * max(0, spaces - 2) + "# {:" + str(50 - spaces) + "s}" + msg = fmt.format(name) + + # Print and recurse (if needed). 
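+    # Dicts recurse with two extra spaces of indentation, tensors print just their size and any
+    # other value is printed as-is; this helper only backs the --print-checkpoint-structure flag.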
+    if isinstance(val, dict):
+        if msg is not None:
+            print(msg)
+        for k in val.keys():
+            recursive_print(k, val[k], spaces + 2)
+    elif isinstance(val, torch.Tensor):
+        print(msg, ":", val.size())
+    else:
+        print(msg, ":", val)
+
+
+####################################################################################################
+
+
+def convert_megatron_checkpoint(args, input_state_dict):
+    # The converted output model.
+    output_state_dict = {}
+
+    # The model.
+    model = input_state_dict["model"]
+    # The language model.
+    lm = model["language_model"]
+    # The embeddings.
+    embeddings = lm["embedding"]
+
+    # The word embeddings.
+    word_embeddings = embeddings["word_embeddings"]["weight"]
+    # Store the word embeddings.
+    output_state_dict["bert.embeddings.word_embeddings.weight"] = word_embeddings
+
+    # The position embeddings.
+    pos_embeddings = embeddings["position_embeddings"]["weight"]
+    # Trained for 512 x 1024.
+    assert pos_embeddings.size(0) == 512 and pos_embeddings.size(1) == 1024
+    # Store the position embeddings.
+    output_state_dict["bert.embeddings.position_embeddings.weight"] = pos_embeddings
+
+    # The token-type embeddings.
+    tokentype_embeddings = embeddings["tokentype_embeddings"]["weight"]
+    # Store the token-type embeddings.
+    output_state_dict["bert.embeddings.token_type_embeddings.weight"] = tokentype_embeddings
+
+    # The transformer.
+    transformer = lm["transformer"]
+
+    # The regex to extract layer names.
+    layer_re = re.compile(r"layers\.(\d+)\.([a-z0-9_.]+)\.([a-z]+)")
+
+    # The simple map of names for "automated" rules.
+    megatron_to_transformers = {
+        "attention.dense": ".attention.output.dense.",
+        "mlp.dense_h_to_4h": ".intermediate.dense.",
+        "mlp.dense_4h_to_h": ".output.dense.",
+    }
+
+    # Keep track of the fused query/key/value weight.
+    attention_qkv_weight = None
+
+    # Extract the layers.
+    for key, val in transformer.items():
+        # Match the name.
+        m = layer_re.match(key)
+
+        # Stop if that's not a layer.
+        if m is None:
+            break
+
+        # The index of the layer.
+        layer_idx = int(m.group(1))
+        # The name of the operation.
+        op_name = m.group(2)
+        # Is it a weight or a bias?
+        weight_or_bias = m.group(3)
+
+        # The name of the layer.
+        layer_name = f"bert.encoder.layer.{layer_idx}"
+
+        # For layernorm(s), simply store the layer norm.
+        if op_name.endswith("layernorm"):
+
+            ln_name = "attention.ln" if op_name.startswith("input") else "ln"
+            output_state_dict[layer_name + "." + ln_name + "." + weight_or_bias] = val
+
+        # Remember the fused QKV weight until the bias arrives.
+        elif op_name == "attention.query_key_value" and weight_or_bias == "weight":
+
+            # Make sure the QKV pointer is nil.
+            assert attention_qkv_weight is None, ""
+
+            # Store the tensor as we need the bias as well to interleave QKV and biases.
+            attention_qkv_weight = val
+
+        # Split the fused QKV weight and bias into separate Q, K and V.
+        elif op_name == "attention.query_key_value" and weight_or_bias == "bias":
+
+            # Make sure we read the weight tensor.
+            assert attention_qkv_weight is not None, ""
+
+            # Split the QKV matrix into Q, K and V. Megatron stores Q,K,V interleaved.
+            q = attention_qkv_weight[0 * 1024 : 1 * 1024, :]
+            k = attention_qkv_weight[1 * 1024 : 2 * 1024, :]
+            v = attention_qkv_weight[2 * 1024 : 3 * 1024, :]
+
+            # Split the bias.
+            q_bias = val[0 * 1024 : 1 * 1024]
+            k_bias = val[1 * 1024 : 2 * 1024]
+            v_bias = val[2 * 1024 : 3 * 1024]
+
+            # Store.
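+            # Shape note: for this 345M checkpoint the fused QKV weight is [3 * 1024, 1024] and the
+            # fused bias is [3 * 1024]; the slices above are written out below under the separate
+            # query/key/value parameter names expected by the converted BERT-style layout.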
+ output_state_dict[f"{layer_name}.attention.self.query.weight"] = q + output_state_dict[f"{layer_name}.attention.self.query.bias"] = q_bias + output_state_dict[f"{layer_name}.attention.self.key.weight"] = k + output_state_dict[f"{layer_name}.attention.self.key.bias"] = k_bias + output_state_dict[f"{layer_name}.attention.self.value.weight"] = v + output_state_dict[f"{layer_name}.attention.self.value.bias"] = v_bias + + # Clear the stored tensor. + attention_qkv_weight = None + + # Copy weights and biases as is. + elif weight_or_bias in ["weight", "bias"]: + + out_name = megatron_to_transformers[op_name] + output_state_dict[layer_name + out_name + weight_or_bias] = val + + # The final layernorm. + output_state_dict["bert.encoder.ln.weight"] = transformer["final_layernorm.weight"] + output_state_dict["bert.encoder.ln.bias"] = transformer["final_layernorm.bias"] + + # The config. + output_config = { + "vocab_size": word_embeddings.size(0), + "hidden_size": 1024, + "num_hidden_layers": 24, + "num_attention_heads": 16, + "hidden_act": "gelu_new", + "intermediate_size": 4096, + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.2, + "layer_norm_eps": 1e-12, + "gradient_checkpointing": False, + "position_embedding_type": "absolute", + "use_cache": False, + } + + # The pooler. + pooler = lm["pooler"] + + # Store the matrix and the bias. + output_state_dict["bert.pooler.dense.weight"] = pooler["dense.weight"] + output_state_dict["bert.pooler.dense.bias"] = pooler["dense.bias"] + + # The LM head from Megatron (for RACE). + lm_head = model["lm_head"] + + # The transform matrix. + output_state_dict["cls.predictions.transform.dense.weight"] = lm_head["dense.weight"] + output_state_dict["cls.predictions.transform.dense.bias"] = lm_head["dense.bias"] + + # The transform LN. + output_state_dict["cls.predictions.transform.LayerNorm.weight"] = lm_head["layernorm.weight"] + output_state_dict["cls.predictions.transform.LayerNorm.bias"] = lm_head["layernorm.bias"] + + # For the decoder, we replicate the weights. + output_state_dict["cls.predictions.decoder.weight"] = word_embeddings + output_state_dict["cls.predictions.bias"] = lm_head["bias"] + + # The classifier from Megatron (for MLNI). + binary_head = model["binary_head"] + + # Store the classifier. + output_state_dict["cls.seq_relationship.weight"] = binary_head["weight"] + output_state_dict["cls.seq_relationship.bias"] = binary_head["bias"] + + # It should be done! + return output_state_dict, output_config + + +#################################################################################################### + + +def main(): + # Create the argument parser. + parser = argparse.ArgumentParser() + parser.add_argument("--print-checkpoint-structure", action="store_true") + parser.add_argument("path_to_checkpoint", type=str, help="Path to the ZIP file containing the checkpoint") + args = parser.parse_args() + + # Extract the basename. + basename = os.path.dirname(args.path_to_checkpoint) + + # Load the model. + print(f'Extracting PyTorch state dictionary from "{args.path_to_checkpoint}"') + with zipfile.ZipFile(args.path_to_checkpoint, "r") as checkpoint: + with checkpoint.open("release/mp_rank_00/model_optim_rng.pt") as pytorch_dict: + input_state_dict = torch.load(pytorch_dict, map_location="cpu") + + # Convert. 
+ print("Converting") + output_state_dict, output_config = convert_megatron_checkpoint(args, input_state_dict) + + # Print the structure of converted state dict. + if args.print_checkpoint_structure: + recursive_print(None, output_state_dict) + + # Store the config to file. + output_config_file = os.path.join(basename, "config.json") + print(f'Saving config to "{output_config_file}"') + with open(output_config_file, "w") as f: + json.dump(output_config, f) + + # Store the state_dict to file. + output_checkpoint_file = os.path.join(basename, "pytorch_model.bin") + print(f'Saving checkpoint to "{output_checkpoint_file}"') + torch.save(output_state_dict, output_checkpoint_file) + + +#################################################################################################### + +if __name__ == "__main__": + main() + +#################################################################################################### diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py new file mode 100755 index 00000000000000..ce4ece3d32fb98 --- /dev/null +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -0,0 +1,1827 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch MegatronBERT model. 
""" + + +import math +import os +import warnings +from dataclasses import dataclass +from typing import Optional, Tuple + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...file_utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from ...modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + NextSentencePredictorOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from ...modeling_utils import ( + PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) +from ...utils import logging +from .configuration_megatron_bert import MegatronBertConfig + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "MegatronBertConfig" +_TOKENIZER_FOR_DOC = "BertTokenizer" +_CHECKPOINT_FOR_DOC = "nvidia/megatron-bert-cased-345m" + +MEGATRON_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "nvidia/megatron-bert-cased-345m", + # See all MegatronBERT models at https://huggingface.co/models?filter=megatron_bert +] + + +def load_tf_weights_in_megatron_bert(model, config, tf_checkpoint_path): + """Load tf checkpoints in a pytorch model.""" + try: + import re + + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info(f"Loading TF weight {name} with shape {shape}") + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + name = name.split("/") + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any( + n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] + for n in name + ): + logger.info(f"Skipping {'/'.join(name)}") + continue + pointer = model + for m_name in name: + if re.fullmatch(r"[A-Za-z]+_\d+", m_name): + scope_names = re.split(r"_(\d+)", m_name) + else: + scope_names = [m_name] + if scope_names[0] == "kernel" or scope_names[0] == "gamma": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "output_bias" or scope_names[0] == "beta": + pointer = getattr(pointer, "bias") + elif scope_names[0] == "output_weights": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "squad": + pointer = getattr(pointer, "classifier") + else: + try: + pointer = getattr(pointer, scope_names[0]) + except AttributeError: + logger.info(f"Skipping {'/'.join(name)}") + continue + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + if m_name[-11:] == "_embeddings": + pointer = getattr(pointer, "weight") + elif m_name == "kernel": + array = np.transpose(array) + try: + assert ( + pointer.shape == array.shape + ), f"Pointer shape {pointer.shape} 
and array shape {array.shape} mismatched" + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info("Initialize PyTorch weight {}".format(name)) + pointer.data = torch.from_numpy(array) + return model + + +class MegatronBertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + + # In Megatron, layer-norm is applied after the 1st dropout. + # self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + + def forward( + self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0 + ): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + + # Megatron BERT moves that layer norm after the drop-out (and to each layer). 
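+        # Concretely, the summed embeddings only go through dropout here; normalization happens
+        # later, inside each layer's MegatronBertAttention.ln / MegatronBertLayer.ln and once more
+        # in MegatronBertEncoder.ln after the last layer.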
+ # embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->MegatronBert +class MegatronBertSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in MegatronBertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + +# Based transformers.models.bert.modeling_bert.BertSelfOutput. Moved LayerNorm to MegatronBertAttention below. +class MegatronBertSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, residual): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + return residual + hidden_states + + +# Based transformers.models.bert.modeling_bert.BertAttention. Added LayerNorm. 
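+# Rough ordering sketch: standard BERT computes `LayerNorm(hidden + dropout(dense(attn(hidden))))`,
+# while the block below computes `hidden + dropout(dense(attn(LayerNorm(hidden))))`, i.e. the
+# LayerNorm is applied to the sub-layer input and the residual is added onto the un-normalized
+# hidden states.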
+class MegatronBertAttention(nn.Module): + def __init__(self, config): + super().__init__() + self.ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.self = MegatronBertSelfAttention(config) + self.output = MegatronBertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + ln_outputs = self.ln(hidden_states) + self_outputs = self.self( + ln_outputs, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->MegatronBert +class MegatronBertIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Based on transformers.models.bert.modeling_bert.BertOutput. Moved LayerNorm to MegatronBertLayer below. +class MegatronBertOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + return input_tensor + hidden_states + + +# Based on transformers.models.bert.modeling_bert.BertLayer. Added LayerNorm. 
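+# The feed-forward path mirrors the attention path: `feed_forward_chunk` normalizes the attention
+# output with `self.ln` before the intermediate/output MLP, and MegatronBertOutput adds the
+# residual back onto the un-normalized attention output.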
+class MegatronBertLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = MegatronBertAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added" + self.crossattention = MegatronBertAttention(config) + self.ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.intermediate = MegatronBertIntermediate(config) + self.output = MegatronBertOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + assert hasattr( + self, "crossattention" + ), f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`" + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + ln_output = self.ln(attention_output) + intermediate_output = self.intermediate(ln_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class MegatronBertEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([MegatronBertLayer(config) for _ in range(config.num_hidden_layers)]) + + # The final layer norm. We removed the 1st LN, moved LN to each hidden layer and this one + # is simply the final LN (Transformer's BERT has it attached to each hidden layer). 
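+        # Net effect: hidden states stay un-normalized between layers (each layer normalizes its own
+        # inputs), and this single LayerNorm produces the final normalized output in forward().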
+ self.ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False) and self.training: + + if use_cache: + logger.warn( + "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting " + "`use_cache=False`..." + ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + # Because we moved the layer-norm at the end of the hidden layer, we have non-normali- + # zed data here. If that's really needed, we must apply LN to match Transformer's BERT. + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + # Finalize the hidden states. + hidden_states = self.ln(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->MegatronBert +class MegatronBertPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +# Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform with Bert->MegatronBert +class MegatronBertPredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertLMPredictionHead with Bert->MegatronBert +class MegatronBertLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = MegatronBertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert->MegatronBert +class MegatronBertOnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = MegatronBertLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +# Copied from transformers.models.bert.modeling_bert.BertOnlyNSPHead with Bert->MegatronBert +class MegatronBertOnlyNSPHead(nn.Module): + def __init__(self, config): + super().__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +# Copied from transformers.models.bert.modeling_bert.BertPreTrainingHeads with Bert->MegatronBert +class MegatronBertPreTrainingHeads(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = MegatronBertLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class MegatronBertPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = MegatronBertConfig + load_tf_weights = load_tf_weights_in_megatron_bert + base_model_prefix = "bert" + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def _init_weights(self, module): + """ Initialize the weights """ + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +@dataclass +# Copied from transformers.models.bert.modeling_bert.BertForPreTrainingOutput with Bert->MegatronBert +class MegatronBertForPreTrainingOutput(ModelOutput): + """ + Output type of :class:`~transformers.MegatronBertForPreTraining`. + + Args: + loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. + prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + prediction_logits: torch.FloatTensor = None + seq_relationship_logits: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +MEGATRON_BERT_START_DOCSTRING = r""" + + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Parameters: + config (:class:`~transformers.MegatronBertConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. 
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. +""" + +MEGATRON_BERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~transformers.BertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + `What are token type IDs? <../glossary.html#token-type-ids>`_ + position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + `What are position IDs? <../glossary.html#position-ids>`_ + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare MegatronBert Model transformer outputting raw hidden-states without any specific head on top.", + MEGATRON_BERT_START_DOCSTRING, +) +class MegatronBertModel(MegatronBertPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in `Attention is + all you need `__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration + set to :obj:`True`. 
To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder` + argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an + input to the forward pass. + """ + + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = MegatronBertEmbeddings(config) + self.encoder = MegatronBertEncoder(config) + + self.pooler = MegatronBertPooler(config) if add_pooling_layer else None + + self.init_weights() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPoolingAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + batch_size, seq_length = input_shape + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size, seq_length = input_shape + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + 
last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +@add_start_docstrings( + """ + MegatronBert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a + `next sentence prediction (classification)` head. + """, + MEGATRON_BERT_START_DOCSTRING, +) +class MegatronBertForPreTraining(MegatronBertPreTrainedModel): + def __init__(self, config, add_binary_head=True): + super().__init__(config) + + self.bert = MegatronBertModel(config) + self.cls = MegatronBertPreTrainingHeads(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=MegatronBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + next_sentence_label=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape ``(batch_size, sequence_length)``, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair + (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): + Used to hide legacy arguments that have been deprecated. 
+ + Returns: + + Example:: + + >>> from transformers import BertTokenizer, MegatronBertForPreTraining + >>> import torch + + >>> tokenizer = BertTokenizer.from_pretrained('nvidia/megatron-bert-cased-345m') + >>> model = MegatronBertForPreTraining.from_pretrained('nvidia/megatron-bert-cased-345m') + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.prediction_logits + >>> seq_relationship_logits = outputs.seq_relationship_logits + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output, pooled_output = outputs[:2] + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) + + total_loss = None + if labels is not None and next_sentence_label is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + total_loss = masked_lm_loss + next_sentence_loss + + if not return_dict: + output = (prediction_scores, seq_relationship_score) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return MegatronBertForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """MegatronBert Model with a `language modeling` head on top for CLM fine-tuning. """, + MEGATRON_BERT_START_DOCSTRING, +) +class MegatronBertForCausalLM(MegatronBertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + if not config.is_decoder: + logger.warning("If you want to use `MegatronBertForCausalLM` as a standalone, add `is_decoder=True.`") + + self.bert = MegatronBertModel(config, add_pooling_layer=False) + self.cls = MegatronBertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. 
+        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
+            ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are
+            ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+        past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+
+            If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
+            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
+            instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
+        use_cache (:obj:`bool`, `optional`):
+            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
+            decoding (see :obj:`past_key_values`).
+
+        Returns:
+
+        Example::
+
+            >>> from transformers import BertTokenizer, MegatronBertForCausalLM, MegatronBertConfig
+            >>> import torch
+
+            >>> tokenizer = BertTokenizer.from_pretrained('nvidia/megatron-bert-cased-345m')
+            >>> model = MegatronBertForCausalLM.from_pretrained('nvidia/megatron-bert-cased-345m', is_decoder=True)
+
+            >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+            >>> outputs = model(**inputs)
+
+            >>> prediction_logits = outputs.logits
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if labels is not None:
+            use_cache = False
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        prediction_scores = self.cls(sequence_output)
+
+        lm_loss = None
+        if labels is not None:
+            # we are doing next-token prediction; shift prediction scores and input ids by one
+            shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
+            labels = labels[:, 1:].contiguous()
+            loss_fct = CrossEntropyLoss()
+            lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if not return_dict:
+            output = (prediction_scores,) + outputs[2:]
+            return ((lm_loss,) + output) if lm_loss is not None else output
+
+        return CausalLMOutputWithCrossAttentions(
+            loss=lm_loss,
+            logits=prediction_scores,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+        )
+
+    def
prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past} + + def _reorder_cache(self, past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past + + +@add_start_docstrings("""MegatronBert Model with a `language modeling` head on top. """, MEGATRON_BERT_START_DOCSTRING) +class MegatronBertForMaskedLM(MegatronBertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler", r"seq_relationship"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + if config.is_decoder: + logger.warning( + "If you want to use `MegatronBertForMaskedLM` make sure `config.is_decoder=False` for " + "bi-directional self-attention." + ) + + self.bert = MegatronBertModel(config, add_pooling_layer=False) + self.cls = MegatronBertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. 
Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + effective_batch_size = input_shape[0] + + # add a dummy token + assert self.config.pad_token_id is not None, "The PAD token should be defined for generation" + attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1) + dummy_token = torch.full( + (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device + ) + input_ids = torch.cat([input_ids, dummy_token], dim=1) + + return {"input_ids": input_ids, "attention_mask": attention_mask} + + +@add_start_docstrings( + """MegatronBert Model with a `next sentence prediction (classification)` head on top. """, + MEGATRON_BERT_START_DOCSTRING, +) +class MegatronBertForNextSentencePrediction(MegatronBertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"predictions"] + + def __init__(self, config): + super().__init__(config) + + self.bert = MegatronBertModel(config) + self.cls = MegatronBertOnlyNSPHead(config) + + self.init_weights() + + @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair + (see ``input_ids`` docstring). Indices should be in ``[0, 1]``: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. 
+ + Returns: + + Example:: + + >>> from transformers import BertTokenizer, MegatronBertForNextSentencePrediction + >>> import torch + + >>> tokenizer = BertTokenizer.from_pretrained('nvidia/megatron-bert-cased-345m') + >>> model = MegatronBertForNextSentencePrediction.from_pretrained('nvidia/megatron-bert-cased-345m') + + >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." + >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." + >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt') + + >>> outputs = model(**encoding, labels=torch.LongTensor([1])) + >>> logits = outputs.logits + >>> assert logits[0, 0] < logits[0, 1] # next sentence was random + """ + + if "next_sentence_label" in kwargs: + warnings.warn( + "The `next_sentence_label` argument is deprecated and will be removed in a future version, use `labels` instead.", + FutureWarning, + ) + labels = kwargs.pop("next_sentence_label") + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + seq_relationship_scores = self.cls(pooled_output) + + next_sentence_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1)) + + if not return_dict: + output = (seq_relationship_scores,) + outputs[2:] + return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output + + return NextSentencePredictorOutput( + loss=next_sentence_loss, + logits=seq_relationship_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + MegatronBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. + """, + MEGATRON_BERT_START_DOCSTRING, +) +class MegatronBertForSequenceClassification(MegatronBertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.bert = MegatronBertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
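+
+        Example (a minimal sketch, assuming the :obj:`nvidia/megatron-bert-cased-345m` checkpoint; the
+        classification head is newly initialized, so the loss and logits below are only illustrative until the
+        model is fine-tuned)::
+
+            >>> from transformers import BertTokenizer, MegatronBertForSequenceClassification
+            >>> import torch
+
+            >>> tokenizer = BertTokenizer.from_pretrained('nvidia/megatron-bert-cased-345m')
+            >>> model = MegatronBertForSequenceClassification.from_pretrained('nvidia/megatron-bert-cased-345m', num_labels=2)
+
+            >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+            >>> outputs = model(**inputs, labels=torch.tensor([1]))
+
+            >>> loss = outputs.loss
+            >>> logits = outputs.logits  # (batch_size, num_labels)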
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.num_labels == 1: + # We are doing regression + loss_fct = MSELoss() + loss = loss_fct(logits.view(-1), labels.view(-1)) + else: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + MegatronBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output + and a softmax) e.g. for RocStories/SWAG tasks. + """, + MEGATRON_BERT_START_DOCSTRING, +) +class MegatronBertForMultipleChoice(MegatronBertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.bert = MegatronBertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + + self.init_weights() + + @add_start_docstrings_to_model_forward( + MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") + ) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., + num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. 
(See + :obj:`input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + MegatronBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. + for Named-Entity-Recognition (NER) tasks. + """, + MEGATRON_BERT_START_DOCSTRING, +) +class MegatronBertForTokenClassification(MegatronBertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.bert = MegatronBertModel(config, add_pooling_layer=False) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - + 1]``. 
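+
+        Example (a minimal sketch, assuming the :obj:`nvidia/megatron-bert-cased-345m` checkpoint; the
+        token-classification head is newly initialized, so the all-zero labels are only illustrative)::
+
+            >>> from transformers import BertTokenizer, MegatronBertForTokenClassification
+            >>> import torch
+
+            >>> tokenizer = BertTokenizer.from_pretrained('nvidia/megatron-bert-cased-345m')
+            >>> model = MegatronBertForTokenClassification.from_pretrained('nvidia/megatron-bert-cased-345m', num_labels=5)
+
+            >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+            >>> labels = torch.zeros_like(inputs["input_ids"])  # one label id per token
+            >>> outputs = model(**inputs, labels=labels)
+
+            >>> loss = outputs.loss
+            >>> logits = outputs.logits  # (batch_size, sequence_length, num_labels)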
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) + ) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + MegatronBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a + linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + MEGATRON_BERT_START_DOCSTRING, +) +class MegatronBertForQuestionAnswering(MegatronBertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.bert = MegatronBertModel(config, add_pooling_layer=False) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the + sequence are not taken into account for computing the loss. + end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the + sequence are not taken into account for computing the loss. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py b/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py new file mode 100644 index 00000000000000..2d2d54b8123a99 --- /dev/null +++ b/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py @@ -0,0 +1,238 @@ +#################################################################################################### + +# Copyright (c) 2021-, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#################################################################################################### + +import argparse +import json +import os +import re +import zipfile + +import torch + + +#################################################################################################### + + +def recursive_print(name, val, spaces=0): + # Format the message. + if name is None: + msg = None + else: + fmt = "." * max(0, spaces - 2) + "# {:" + str(50 - spaces) + "s}" + msg = fmt.format(name) + + # Print and recurse (if needed). 
+ if isinstance(val, dict): + if msg is not None: + print(msg) + for k in val.keys(): + recursive_print(k, val[k], spaces + 2) + elif isinstance(val, torch.Tensor): + print(msg, ":", val.size()) + else: + print(msg, ":", val) + + +#################################################################################################### + + +def convert_megatron_checkpoint(args, input_state_dict): + # The converted output model. + output_state_dict = {} + + # The number of heads. + heads = 16 + # The hidden_size per head. + hidden_size_per_head = 64 + + # The model. + model = input_state_dict["model"] + # The language model. + lm = model["language_model"] + # The embeddings. + embeddings = lm["embedding"] + + # The word embeddings. + word_embeddings = embeddings["word_embeddings"]["weight"] + # Truncate the embedding table to 50257 rows. + word_embeddings = word_embeddings[:50257, :] + # Truncate the embedding table to 50257 rows. + output_state_dict["transformer.wte.weight"] = word_embeddings + + # The position embeddings. + pos_embeddings = embeddings["position_embeddings"]["weight"] + # Read the hidden dimension. + hidden_size = pos_embeddings.size(0) + # DEBUG. + assert hidden_size == heads * hidden_size_per_head + # Store the position embeddings. + output_state_dict["transformer.wpe.weight"] = pos_embeddings + + # The transformer. + transformer = lm["transformer"] + + # The regex to extract layer names. + layer_re = re.compile("layers\.(\d+)\.([a-z0-9_.]+)\.([a-z]+)") + + # The simple map of names for "automated" rules. + megatron_to_transformers = { + "attention.dense": ".attn.c_proj.", + "mlp.dense_h_to_4h": ".mlp.c_fc.", + "mlp.dense_4h_to_h": ".mlp.c_proj.", + } + + # Extract the layers. + for key, val in transformer.items(): + # Match the name. + m = layer_re.match(key) + + # Stop if that's not a layer + if m is None: + break + + # The index of the layer. + layer_idx = int(m.group(1)) + # The name of the operation. + op_name = m.group(2) + # Is it a weight or a bias? + weight_or_bias = m.group(3) + + # The name of the layer. + layer_name = f"transformer.h.{layer_idx}" + + # For layernorm(s), simply store the layer norm. + if op_name.endswith("layernorm"): + + ln_name = "ln_1" if op_name.startswith("input") else "ln_2" + output_state_dict[layer_name + "." + ln_name + "." + weight_or_bias] = val + + # Transpose the QKV matrix. + elif op_name == "attention.query_key_value" and weight_or_bias == "weight": + + # Insert a tensor of 1x1xDxD bias. + zeros = torch.ones(1, 1, hidden_size, hidden_size) + output_state_dict[layer_name + ".attn.bias"] = zeros + + # Insert a "dummy" tensor for masked_bias. + masked_bias = torch.tensor(-1e4) + output_state_dict[layer_name + ".attn.masked_bias"] = masked_bias + + # Megatron stores (3*D) x D but transformers-GPT2 expects D x 3*D. + out_val = val.transpose(0, 1) + # Store. + output_state_dict[layer_name + ".attn.c_attn.weight"] = out_val + + # Transpose the bias. + elif op_name == "attention.query_key_value" and weight_or_bias == "bias": + + # Store. No change of shape. + output_state_dict[layer_name + ".attn.c_attn.bias"] = val + + # Transpose the weights. + elif weight_or_bias == "weight": + + out_name = megatron_to_transformers[op_name] + output_state_dict[layer_name + out_name + "weight"] = val.transpose(0, 1) + + # Copy the bias. + elif weight_or_bias == "bias": + + out_name = megatron_to_transformers[op_name] + output_state_dict[layer_name + out_name + "bias"] = val + + # The final layernorm. 
+ output_state_dict["transformer.ln_f.weight"] = transformer["final_layernorm.weight"] + output_state_dict["transformer.ln_f.bias"] = transformer["final_layernorm.bias"] + + # For LM head, transformers' wants the matrix to weight embeddings. + output_state_dict["lm_head.weight"] = word_embeddings + + # The config. + output_config = { + "activation_function": "gelu_new", + "architectures": ["GPT2LMHeadModel"], + "attn_pdrop": 0.1, + "bos_token_id": 50256, + "embd_pdrop": 0.1, + "eos_token_id": 50256, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_ctx": 1024, + "n_embd": 1024, + "n_head": 16, + "n_layer": 24, + "n_positions": 1024, + "resid_pdrop": 0.1, + "summary_activation": None, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": True, + "summary_type": "cls_index", + "summary_use_proj": True, + "vocab_size": 50257, + } + + # It should be done! + return output_state_dict, output_config + + +#################################################################################################### + + +def main(): + # Create the argument parser. + parser = argparse.ArgumentParser() + parser.add_argument("--print-checkpoint-structure", action="store_true") + parser.add_argument("path_to_checkpoint", type=str, help="Path to the ZIP file containing the checkpoint") + args = parser.parse_args() + + # Extract the basename. + basename = os.path.dirname(args.path_to_checkpoint) + + # Load the model. + print('Extracting PyTorch state dictionary from "{}"'.format(args.path_to_checkpoint)) + with zipfile.ZipFile(args.path_to_checkpoint, "r") as checkpoint: + with checkpoint.open("release/mp_rank_00/model_optim_rng.pt") as pytorch_dict: + input_state_dict = torch.load(pytorch_dict, map_location="cpu") + + # Convert. + print("Converting") + output_state_dict, output_config = convert_megatron_checkpoint(args, input_state_dict) + + # Print the structure of converted state dict. + if args.print_checkpoint_structure: + recursive_print(None, output_state_dict) + + # Store the config to file. + output_config_file = os.path.join(basename, "config.json") + print(f'Saving config to "{output_config_file}"') + with open(output_config_file, "w") as f: + json.dump(output_config, f) + + # Store the state_dict to file. + output_checkpoint_file = os.path.join(basename, "pytorch_model.bin") + print(f'Saving checkpoint to "{output_checkpoint_file}"') + torch.save(output_state_dict, output_checkpoint_file) + + +#################################################################################################### + +if __name__ == "__main__": + main() + +#################################################################################################### diff --git a/src/transformers/models/mpnet/tokenization_mpnet.py b/src/transformers/models/mpnet/tokenization_mpnet.py index 125fde68a5bf96..8041ec4ec5f77f 100644 --- a/src/transformers/models/mpnet/tokenization_mpnet.py +++ b/src/transformers/models/mpnet/tokenization_mpnet.py @@ -266,12 +266,9 @@ def get_special_tokens_mask( :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formated with special tokens for the model." 
- ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) if token_ids_1 is None: return [1] + ([0] * len(token_ids_0)) + [1] diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py index 5cbbd31080ef97..c46582f70bcbcf 100755 --- a/src/transformers/models/pegasus/modeling_pegasus.py +++ b/src/transformers/models/pegasus/modeling_pegasus.py @@ -987,7 +987,7 @@ def forward( if getattr(self.config, "gradient_checkpointing", False) and self.training: if use_cache: - logger.warn( + logger.warning( "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting " "`use_cache=False`..." ) diff --git a/src/transformers/models/phobert/tokenization_phobert.py b/src/transformers/models/phobert/tokenization_phobert.py index e99e58002e8880..3caca9012d238c 100644 --- a/src/transformers/models/phobert/tokenization_phobert.py +++ b/src/transformers/models/phobert/tokenization_phobert.py @@ -201,12 +201,9 @@ def get_special_tokens_mask( """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) if token_ids_1 is None: return [1] + ([0] * len(token_ids_0)) + [1] diff --git a/src/transformers/models/prophetnet/modeling_prophetnet.py b/src/transformers/models/prophetnet/modeling_prophetnet.py index 03aac1bd899819..3b369c3373dac0 100644 --- a/src/transformers/models/prophetnet/modeling_prophetnet.py +++ b/src/transformers/models/prophetnet/modeling_prophetnet.py @@ -1475,7 +1475,7 @@ def forward( if getattr(self.config, "gradient_checkpointing", False) and self.training: if use_cache: - logger.warn( + logger.warning( "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting " "`use_cache=False`..." ) diff --git a/src/transformers/models/prophetnet/tokenization_prophetnet.py b/src/transformers/models/prophetnet/tokenization_prophetnet.py index cd51662b5599e9..25df78162e6272 100644 --- a/src/transformers/models/prophetnet/tokenization_prophetnet.py +++ b/src/transformers/models/prophetnet/tokenization_prophetnet.py @@ -203,12 +203,9 @@ def get_special_tokens_mask( :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." 
- ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) if token_ids_1 is None: return ([0] * len(token_ids_0)) + [1] diff --git a/src/transformers/models/rag/retrieval_rag.py b/src/transformers/models/rag/retrieval_rag.py index dd1ddc03d7dcc4..8e2676298716aa 100644 --- a/src/transformers/models/rag/retrieval_rag.py +++ b/src/transformers/models/rag/retrieval_rag.py @@ -21,14 +21,7 @@ import numpy as np -from ...file_utils import ( - cached_path, - is_datasets_available, - is_faiss_available, - is_remote_url, - requires_datasets, - requires_faiss, -) +from ...file_utils import cached_path, is_datasets_available, is_faiss_available, is_remote_url, requires_backends from ...tokenization_utils_base import BatchEncoding from ...utils import logging from .configuration_rag import RagConfig @@ -372,8 +365,7 @@ class RagRetriever: def __init__(self, config, question_encoder_tokenizer, generator_tokenizer, index=None, init_retrieval=True): self._init_retrieval = init_retrieval - requires_datasets(self) - requires_faiss(self) + requires_backends(self, ["datasets", "faiss"]) super().__init__() self.index = index or self._build_index(config) self.generator_tokenizer = generator_tokenizer @@ -411,8 +403,7 @@ def _build_index(config): @classmethod def from_pretrained(cls, retriever_name_or_path, indexed_dataset=None, **kwargs): - requires_datasets(cls) - requires_faiss(cls) + requires_backends(cls, ["datasets", "faiss"]) config = kwargs.pop("config", None) or RagConfig.from_pretrained(retriever_name_or_path, **kwargs) rag_tokenizer = RagTokenizer.from_pretrained(retriever_name_or_path, config=config) question_encoder_tokenizer = rag_tokenizer.question_encoder diff --git a/src/transformers/models/reformer/configuration_reformer.py b/src/transformers/models/reformer/configuration_reformer.py index 08d12dc45e82e5..93501fca7092e1 100755 --- a/src/transformers/models/reformer/configuration_reformer.py +++ b/src/transformers/models/reformer/configuration_reformer.py @@ -44,7 +44,7 @@ class ReformerConfig(PretrainedConfig): For more information on LSHSelfAttention layer, see `LSH Self Attention `__. For more information on LocalSelfAttention layer, see `Local Self - Attention `__. + Attention `__. axial_pos_embds (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether or not to use axial position embeddings. For more information on how axial position embeddings work, see `Axial Position Encodings `__. diff --git a/src/transformers/models/roberta/modeling_roberta.py b/src/transformers/models/roberta/modeling_roberta.py index 88155f76de29f2..f7a73b336c79e7 100644 --- a/src/transformers/models/roberta/modeling_roberta.py +++ b/src/transformers/models/roberta/modeling_roberta.py @@ -484,7 +484,7 @@ def forward( if getattr(self.config, "gradient_checkpointing", False) and self.training: if use_cache: - logger.warn( + logger.warning( "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting " "`use_cache=False`..." 
) diff --git a/src/transformers/models/roberta/tokenization_roberta.py b/src/transformers/models/roberta/tokenization_roberta.py index 9a037d1d1551a1..696868fdfc30c3 100644 --- a/src/transformers/models/roberta/tokenization_roberta.py +++ b/src/transformers/models/roberta/tokenization_roberta.py @@ -215,12 +215,9 @@ def get_special_tokens_mask( :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) if token_ids_1 is None: return [1] + ([0] * len(token_ids_0)) + [1] diff --git a/src/transformers/models/speech_to_text/__init__.py b/src/transformers/models/speech_to_text/__init__.py index 0defd14c0032c7..026312e8cdab25 100644 --- a/src/transformers/models/speech_to_text/__init__.py +++ b/src/transformers/models/speech_to_text/__init__.py @@ -17,7 +17,7 @@ # limitations under the License. from typing import TYPE_CHECKING -from ...file_utils import _BaseLazyModule, is_sentencepiece_available, is_torch_available +from ...file_utils import _BaseLazyModule, is_sentencepiece_available, is_speech_available, is_torch_available _import_structure = { @@ -25,13 +25,17 @@ "SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "Speech2TextConfig", ], - "feature_extraction_speech_to_text": ["Speech2TextFeatureExtractor"], } if is_sentencepiece_available(): - _import_structure["processing_speech_to_text"] = ["Speech2TextProcessor"] _import_structure["tokenization_speech_to_text"] = ["Speech2TextTokenizer"] +if is_speech_available(): + _import_structure["feature_extraction_speech_to_text"] = ["Speech2TextFeatureExtractor"] + + if is_sentencepiece_available(): + _import_structure["processing_speech_to_text"] = ["Speech2TextProcessor"] + if is_torch_available(): _import_structure["modeling_speech_to_text"] = [ "SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -43,12 +47,16 @@ if TYPE_CHECKING: from .configuration_speech_to_text import SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, Speech2TextConfig - from .feature_extraction_speech_to_text import Speech2TextFeatureExtractor if is_sentencepiece_available(): - from .processing_speech_to_text import Speech2TextProcessor from .tokenization_speech_to_text import Speech2TextTokenizer + if is_speech_available(): + from .feature_extraction_speech_to_text import Speech2TextFeatureExtractor + + if is_sentencepiece_available(): + from .processing_speech_to_text import Speech2TextProcessor + if is_torch_available(): from .modeling_speech_to_text import ( SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py b/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py index e7fdb44aefe40b..a7c21a969f9c0b 100644 --- a/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py +++ b/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py @@ -19,19 +19,15 @@ from typing import List, Optional, Union import numpy as np +import torch +import torchaudio.compliance.kaldi as ta_kaldi from ...feature_extraction_sequence_utils import SequenceFeatureExtractor from 
...feature_extraction_utils import BatchFeature -from ...file_utils import PaddingStrategy, TensorType, is_torch_available, is_torchaudio_available +from ...file_utils import PaddingStrategy, TensorType from ...utils import logging -if is_torch_available(): - import torch - -if is_torchaudio_available(): - import torchaudio.compliance.kaldi as ta_kaldi - logger = logging.get_logger(__name__) @@ -75,8 +71,6 @@ def __init__( normalize_vars=True, **kwargs ): - if not is_torchaudio_available(): - raise ImportError("`Speech2TextFeatureExtractor` requires torchaudio: `pip install torchaudio`.") super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs) self.num_mel_bins = num_mel_bins self.do_ceptral_normalize = do_ceptral_normalize diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py index 1c3c6f00110fd3..6afb3f6791ede9 100755 --- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py @@ -1015,7 +1015,7 @@ def forward( if getattr(self.config, "gradient_checkpointing", False) and self.training: if use_cache: - logger.warn( + logger.warning( "`use_cache = True` is incompatible with `config.gradient_checkpointing = True`. Setting `use_cache = False`..." ) use_cache = False diff --git a/src/transformers/models/speech_to_text/tokenization_speech_to_text.py b/src/transformers/models/speech_to_text/tokenization_speech_to_text.py index bf3402295aa337..502021d535793e 100644 --- a/src/transformers/models/speech_to_text/tokenization_speech_to_text.py +++ b/src/transformers/models/speech_to_text/tokenization_speech_to_text.py @@ -199,12 +199,10 @@ def get_special_tokens_mask( """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.bos_token_id, self.eos_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + prefix_ones = [1] * len(self.prefix_tokens) suffix_ones = [1] if token_ids_1 is None: diff --git a/src/transformers/models/t5/tokenization_t5.py b/src/transformers/models/t5/tokenization_t5.py index 74dc811c6e4561..90a0159aefb7d2 100644 --- a/src/transformers/models/t5/tokenization_t5.py +++ b/src/transformers/models/t5/tokenization_t5.py @@ -157,12 +157,10 @@ def get_special_tokens_mask( :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." 
- ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + # normal case: some special tokens if token_ids_1 is None: return ([0] * len(token_ids_0)) + [1] diff --git a/src/transformers/models/tapas/modeling_tapas.py b/src/transformers/models/tapas/modeling_tapas.py index fd1d08145c7bd6..5bfca58596c435 100644 --- a/src/transformers/models/tapas/modeling_tapas.py +++ b/src/transformers/models/tapas/modeling_tapas.py @@ -33,7 +33,7 @@ add_start_docstrings_to_model_forward, is_scatter_available, replace_return_docstrings, - requires_scatter, + requires_backends, ) from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, MaskedLMOutput, SequenceClassifierOutput from ...modeling_utils import ( @@ -792,7 +792,7 @@ class TapasModel(TapasPreTrainedModel): """ def __init__(self, config, add_pooling_layer=True): - requires_scatter(self) + requires_backends(self, "scatter") super().__init__(config) self.config = config diff --git a/src/transformers/models/tapas/tokenization_tapas.py b/src/transformers/models/tapas/tokenization_tapas.py index 9716193951f9b0..3d1e82ac518085 100644 --- a/src/transformers/models/tapas/tokenization_tapas.py +++ b/src/transformers/models/tapas/tokenization_tapas.py @@ -510,12 +510,9 @@ def get_special_tokens_mask( """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) if token_ids_1 is not None: return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py index 99bd60c463ede2..b7d20ec7859c28 100644 --- a/src/transformers/models/vit/modeling_vit.py +++ b/src/transformers/models/vit/modeling_vit.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2021 Google AI, Ross Weightman, The HuggingFace Inc. team. All rights reserved. +# Copyright 2021 Google AI, Ross Wightman, The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/src/transformers/models/wav2vec2/convert_wav2vec2_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/wav2vec2/convert_wav2vec2_original_pytorch_checkpoint_to_pytorch.py index d386d8b7bfb915..02be2b8ec73471 100644 --- a/src/transformers/models/wav2vec2/convert_wav2vec2_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/wav2vec2/convert_wav2vec2_original_pytorch_checkpoint_to_pytorch.py @@ -111,7 +111,7 @@ def recursively_load_weights(fairseq_model, hf_model, is_finetuned): if not is_used: unused_weights.append(name) - logger.warn(f"Unused weights: {unused_weights}") + logger.warning(f"Unused weights: {unused_weights}") def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm): diff --git a/src/transformers/models/xlm/modeling_tf_xlm.py b/src/transformers/models/xlm/modeling_tf_xlm.py index f2989ffa56c3b5..6bac6f597c3f2f 100644 --- a/src/transformers/models/xlm/modeling_tf_xlm.py +++ b/src/transformers/models/xlm/modeling_tf_xlm.py @@ -1140,7 +1140,7 @@ def call( ) if inputs["lengths"] is not None: - logger.warn( + logger.warning( "The `lengths` parameter cannot be used with the XLM multiple choice models. Please use the " "attention mask instead.", ) diff --git a/src/transformers/models/xlm/modeling_xlm.py b/src/transformers/models/xlm/modeling_xlm.py index 3ccd63ee9781ed..a4a6c0dd08a33f 100755 --- a/src/transformers/models/xlm/modeling_xlm.py +++ b/src/transformers/models/xlm/modeling_xlm.py @@ -1232,7 +1232,7 @@ def forward( ) if lengths is not None: - logger.warn( + logger.warning( "The `lengths` parameter cannot be used with the XLM multiple choice models. Please use the " "attention mask instead." ) diff --git a/src/transformers/models/xlm/tokenization_xlm.py b/src/transformers/models/xlm/tokenization_xlm.py index d861ccc0ed04fe..95730451fddd12 100644 --- a/src/transformers/models/xlm/tokenization_xlm.py +++ b/src/transformers/models/xlm/tokenization_xlm.py @@ -906,16 +906,8 @@ def get_special_tokens_mask( """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list( - map( - lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, - token_ids_0, - ) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True ) if token_ids_1 is not None: diff --git a/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py b/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py index ba1d160ee29815..188292ed177527 100644 --- a/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py +++ b/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py @@ -200,12 +200,9 @@ def get_special_tokens_mask( """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." 
- ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) if token_ids_1 is None: return ([0] * len(token_ids_0)) + [1] diff --git a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py index 4549d212ecf89a..8ecec6dffe0bb6 100644 --- a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py @@ -206,12 +206,9 @@ def get_special_tokens_mask( """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) if token_ids_1 is None: return [1] + ([0] * len(token_ids_0)) + [1] diff --git a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py index 9426d6c4aa1adb..fbdeca2e1a24b6 100644 --- a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py +++ b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py @@ -172,37 +172,6 @@ def build_inputs_with_special_tokens( sep = [self.sep_token_id] return cls + token_ids_0 + sep + sep + token_ids_1 + sep - def get_special_tokens_mask( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False - ) -> List[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` method. - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - - if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." 
- ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) - - if token_ids_1 is None: - return [1] + ([0] * len(token_ids_0)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] - def create_token_type_ids_from_sequences( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: diff --git a/src/transformers/models/xlnet/tokenization_xlnet.py b/src/transformers/models/xlnet/tokenization_xlnet.py index 4980f450cba75c..97fd542c2812ae 100644 --- a/src/transformers/models/xlnet/tokenization_xlnet.py +++ b/src/transformers/models/xlnet/tokenization_xlnet.py @@ -270,12 +270,9 @@ def get_special_tokens_mask( """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) if token_ids_1 is not None: return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1] diff --git a/src/transformers/models/xlnet/tokenization_xlnet_fast.py b/src/transformers/models/xlnet/tokenization_xlnet_fast.py index f3a46c2d785b81..364dccf3d6aa8a 100644 --- a/src/transformers/models/xlnet/tokenization_xlnet_fast.py +++ b/src/transformers/models/xlnet/tokenization_xlnet_fast.py @@ -190,37 +190,6 @@ def build_inputs_with_special_tokens( return token_ids_0 + sep + cls return token_ids_0 + sep + token_ids_1 + sep + cls - def get_special_tokens_mask( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False - ) -> List[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` method. - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - - if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." 
- ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) - - if token_ids_1 is not None: - return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1] - return ([0] * len(token_ids_0)) + [1, 1] - def create_token_type_ids_from_sequences( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py index 2455f47c09fb5a..fb1b959d4686da 100755 --- a/src/transformers/pipelines/__init__.py +++ b/src/transformers/pipelines/__init__.py @@ -246,6 +246,7 @@ def pipeline( framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, + use_auth_token: Optional[Union[str, bool]] = None, model_kwargs: Dict[str, Any] = {}, **kwargs ) -> Pipeline: @@ -308,6 +309,10 @@ def pipeline( artifacts on huggingface.co, so ``revision`` can be any identifier allowed by git. use_fast (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether or not to use a Fast tokenizer if possible (a :class:`~transformers.PreTrainedTokenizerFast`). + use_auth_token (:obj:`str` or `bool`, `optional`): + The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token + generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`). + revision(:obj:`str`, `optional`, defaults to :obj:`"main"`): model_kwargs: Additional dictionary of keyword arguments passed along to the model's :obj:`from_pretrained(..., **model_kwargs)` function. @@ -367,6 +372,9 @@ def pipeline( task_class, model_class = targeted_task["impl"], targeted_task[framework] + # Retrieve use_auth_token and add it to model_kwargs to be used in .from_pretrained + model_kwargs["use_auth_token"] = model_kwargs.get("use_auth_token", use_auth_token) + # Instantiate tokenizer if needed if isinstance(tokenizer, (str, tuple)): if isinstance(tokenizer, tuple): @@ -377,12 +385,12 @@ def pipeline( ) else: tokenizer = AutoTokenizer.from_pretrained( - tokenizer, revision=revision, use_fast=use_fast, _from_pipeline=task + tokenizer, revision=revision, use_fast=use_fast, _from_pipeline=task, **model_kwargs ) # Instantiate config if needed if isinstance(config, str): - config = AutoConfig.from_pretrained(config, revision=revision, _from_pipeline=task) + config = AutoConfig.from_pretrained(config, revision=revision, _from_pipeline=task, **model_kwargs) # Instantiate modelcard if needed if isinstance(modelcard, str): diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py index 9da13796f58e47..d06376aa43c077 100644 --- a/src/transformers/pipelines/base.py +++ b/src/transformers/pipelines/base.py @@ -48,7 +48,7 @@ def infer_framework_from_model( - model, model_classes: Optional[Dict[str, type]] = None, revision: Optional[str] = None, task: Optional[str] = None + model, model_classes: Optional[Dict[str, type]] = None, task: Optional[str] = None, **model_kwargs ): """ Select framework (TensorFlow or PyTorch) to use from the :obj:`model` passed. Returns a tuple (framework, model). @@ -65,10 +65,11 @@ def infer_framework_from_model( from. model_classes (dictionary :obj:`str` to :obj:`type`, `optional`): A mapping framework to class. - revision (:obj:`str`, `optional`): - The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a - git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any - identifier allowed by git. 
+ task (:obj:`str`): + The task defining which pipeline will be returned. + model_kwargs: + Additional dictionary of keyword arguments passed along to the model's :obj:`from_pretrained(..., + **model_kwargs)` function. Returns: :obj:`Tuple`: A tuple framework, model. @@ -80,19 +81,20 @@ def infer_framework_from_model( "To install PyTorch, read the instructions at https://pytorch.org/." ) if isinstance(model, str): + model_kwargs["_from_pipeline"] = task if is_torch_available() and not is_tf_available(): model_class = model_classes.get("pt", AutoModel) - model = model_class.from_pretrained(model, revision=revision, _from_pipeline=task) + model = model_class.from_pretrained(model, **model_kwargs) elif is_tf_available() and not is_torch_available(): model_class = model_classes.get("tf", TFAutoModel) - model = model_class.from_pretrained(model, revision=revision, _from_pipeline=task) + model = model_class.from_pretrained(model, **model_kwargs) else: try: model_class = model_classes.get("pt", AutoModel) - model = model_class.from_pretrained(model, revision=revision, _from_pipeline=task) + model = model_class.from_pretrained(model, **model_kwargs) except OSError: model_class = model_classes.get("tf", TFAutoModel) - model = model_class.from_pretrained(model, revision=revision, _from_pipeline=task) + model = model_class.from_pretrained(model, **model_kwargs) framework = "tf" if model.__class__.__name__.startswith("TF") else "pt" return framework, model diff --git a/src/transformers/pipelines/table_question_answering.py b/src/transformers/pipelines/table_question_answering.py index a846e0d939478a..9ab07b10e81d71 100644 --- a/src/transformers/pipelines/table_question_answering.py +++ b/src/transformers/pipelines/table_question_answering.py @@ -2,7 +2,7 @@ import numpy as np -from ..file_utils import add_end_docstrings, is_torch_available, requires_pandas +from ..file_utils import add_end_docstrings, is_torch_available, requires_backends from .base import PIPELINE_INIT_ARGS, ArgumentHandler, Pipeline, PipelineException @@ -24,7 +24,7 @@ def __call__(self, table=None, query=None, sequential=False, padding=True, trunc # ..., # {"table": pd.DataFrame, "query" : List[str]} # ] - requires_pandas(self) + requires_backends(self, "pandas") import pandas as pd if table is None: diff --git a/src/transformers/pipelines/zero_shot_classification.py b/src/transformers/pipelines/zero_shot_classification.py index 24e99072b6f088..dd66fb95877ff4 100644 --- a/src/transformers/pipelines/zero_shot_classification.py +++ b/src/transformers/pipelines/zero_shot_classification.py @@ -142,7 +142,7 @@ def __call__( """ if "multi_class" in kwargs and kwargs["multi_class"] is not None: multi_label = kwargs.pop("multi_class") - logger.warn( + logger.warning( "The `multi_class` argument has been deprecated and renamed to `multi_label`. " "`multi_class` will be removed in a future version of Transformers." 
) diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 3f1273a7c9d776..a5c4e7d2b8ab25 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -24,6 +24,7 @@ from distutils.util import strtobool from io import StringIO from pathlib import Path +from typing import Iterator, Union from .file_utils import ( is_datasets_available, @@ -621,6 +622,27 @@ def __repr__(self): return f"captured: {self.out}\n" +@contextlib.contextmanager +# adapted from https://stackoverflow.com/a/64789046/9201239 +def ExtendSysPath(path: Union[str, os.PathLike]) -> Iterator[None]: + """ + Temporarily add given path to `sys.path`. + + Usage :: + + with ExtendSysPath('/path/to/dir'): + mymodule = importlib.import_module('mymodule') + + """ + + path = os.fspath(path) + try: + sys.path.insert(0, path) + yield + finally: + sys.path.remove(path) + + class TestCasePlus(unittest.TestCase): """ This class extends `unittest.TestCase` with additional features. diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py index 0606c3f9c1dddc..fafe8a5597b67d 100644 --- a/src/transformers/tokenization_utils.py +++ b/src/transformers/tokenization_utils.py @@ -670,6 +670,16 @@ def get_special_tokens_mask( Returns: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0)) @overload diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 6ccf3f48f7444d..7b68164b914467 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -825,7 +825,13 @@ def add_special_tokens(self, special_tokens_dict: Dict[str, Union[str, AddedToke special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last index of the current vocabulary). - Using : obj:`add_special_tokens` will ensure your special tokens can be used in several ways: + .. Note:: + When adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix of + the model so that its embedding matrix matches the tokenizer. + + In order to do that, please use the :meth:`~transformers.PreTrainedModel.resize_token_embeddings` method. + + Using :obj:`add_special_tokens` will ensure your special tokens can be used in several ways: - Special tokens are carefully handled by the tokenizer (they are never split). - You can easily refer to special tokens using tokenizer class attributes like :obj:`tokenizer.cls_token`.
This diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 7c33981b6d98f8..41800b7fd3a32c 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -17,7 +17,6 @@ """ import collections -import gc import inspect import math import os @@ -41,7 +40,8 @@ is_ray_tune_available, run_hp_search_optuna, run_hp_search_ray, - init_deepspeed, + deepspeed_init, + is_deepspeed_zero3_enabled, ) import numpy as np @@ -54,6 +54,7 @@ from torch.utils.data.sampler import RandomSampler, SequentialSampler from .data.data_collator import DataCollator, DataCollatorWithPadding, default_data_collator +from .dependency_versions_check import dep_version_check from .file_utils import ( WEIGHTS_NAME, is_apex_available, @@ -139,17 +140,14 @@ import torch_xla.distributed.parallel_loader as pl if is_fairscale_available(): + dep_version_check("fairscale") import fairscale + from fairscale.nn.data_parallel import FullyShardedDataParallel as FullyShardedDDP from fairscale.nn.data_parallel import ShardedDataParallel as ShardedDDP + from fairscale.nn.wrap import auto_wrap from fairscale.optim import OSS from fairscale.optim.grad_scaler import ShardedGradScaler - if version.parse(fairscale.__version__) >= version.parse("0.3"): - from fairscale.nn.data_parallel import FullyShardedDataParallel as FullyShardedDDP - from fairscale.nn.wrap import auto_wrap - else: - FullyShardedDDP = None - if is_sagemaker_dp_enabled(): import smdistributed.dataparallel.torch.distributed as dist from smdistributed.dataparallel.torch.parallel.distributed import DistributedDataParallel as DDP @@ -921,7 +919,7 @@ def train( logger.info(f"Loading model from {resume_from_checkpoint}).") if self.deepspeed: - # will be resumed in init_deepspeed + # will be resumed in deepspeed_init pass elif isinstance(self.model, PreTrainedModel): self.model = self.model.from_pretrained(resume_from_checkpoint) @@ -965,12 +963,12 @@ def train( delay_optimizer_creation = self.sharded_ddp is not None and self.sharded_ddp != ShardedDDPOption.SIMPLE if self.args.deepspeed: - model, optimizer, lr_scheduler = init_deepspeed( + deepspeed_engine, optimizer, lr_scheduler = deepspeed_init( self, num_training_steps=max_steps, resume_from_checkpoint=resume_from_checkpoint ) - self.model = model.module - self.model_wrapped = model - self.deepspeed = model # DeepSpeedEngine object + self.model = deepspeed_engine.module + self.model_wrapped = deepspeed_engine + self.deepspeed = deepspeed_engine self.optimizer = optimizer self.lr_scheduler = lr_scheduler elif not delay_optimizer_creation: @@ -1151,17 +1149,21 @@ def train( ) # Optimizer step + optimizer_was_run = True if self.deepspeed: pass # called outside the loop elif is_torch_tpu_available(): xm.optimizer_step(self.optimizer) elif self.use_amp: + scale_before = self.scaler.get_scale() self.scaler.step(self.optimizer) self.scaler.update() + scale_after = self.scaler.get_scale() + optimizer_was_run = scale_before <= scale_after else: self.optimizer.step() - if not self.deepspeed: + if optimizer_was_run and not self.deepspeed: self.lr_scheduler.step() model.zero_grad() @@ -1227,18 +1229,6 @@ def train( # add remaining tr_loss self._total_loss_scalar += tr_loss.item() - if self.deepspeed: - # free up any memory that might be useful for eval - self.deepspeed = None - self.optimizer = None - self.lr_scheduler = None - self.model_wrapped = self.model - gc.collect() # force memory release - # to restore normal behavior outside of train replay the place_model_on_device logic w/o deepspeed - 
self.place_model_on_device = self.args.place_model_on_device - if self.is_model_parallel: - self.place_model_on_device = False - self.is_in_train = False self._memory_tracker.stop_and_update_metrics(metrics) @@ -1293,6 +1283,8 @@ def _save_checkpoint(self, model, trial, metrics=None): output_dir = os.path.join(run_dir, checkpoint_folder) self.save_model(output_dir) if self.deepspeed: + # under zero3 the model file itself doesn't get saved since it's bogus, unless the deepspeed + # config `stage3_gather_fp16_weights_on_model_save` is True self.deepspeed.save_checkpoint(output_dir) # Save optimizer and scheduler @@ -1351,7 +1343,7 @@ def _load_optimizer_and_scheduler(self, checkpoint): return if self.deepspeed: - # deepspeed loads optimizer/lr_scheduler together with the model in init_deepspeed + # deepspeed loads optimizer/lr_scheduler together with the model in deepspeed_init return if os.path.isfile(os.path.join(checkpoint, "optimizer.pt")) and os.path.isfile( @@ -1597,6 +1589,10 @@ def save_model(self, output_dir: Optional[str] = None): Will only save from the main process. """ + + if output_dir is None: + output_dir = self.args.output_dir + if is_torch_tpu_available(): self._save_tpu(output_dir) elif is_sagemaker_mp_enabled(): @@ -1608,8 +1604,31 @@ def save_model(self, output_dir: Optional[str] = None): ShardedDDPOption.ZERO_DP_2 in self.args.sharded_ddp or ShardedDDPOption.ZERO_DP_3 in self.args.sharded_ddp ): state_dict = self.model.state_dict() + if self.is_world_process_zero(): self._save(output_dir, state_dict=state_dict) + elif self.deepspeed: + + # this takes care of everything as long as we aren't under zero3 + if self.is_world_process_zero(): + self._save(output_dir) + + if is_deepspeed_zero3_enabled(): + # It's too complicated to try to override different places where the weights dump gets + # saved, so since under zero3 the file is bogus, simply delete it. The user should + # either use the deepspeed checkpoint to resume or to recover full weights use + # zero_to_fp32.py stored in the checkpoint. + if self.is_world_process_zero(): + file = os.path.join(output_dir, WEIGHTS_NAME) + if os.path.isfile(file): + # logger.info(f"deepspeed zero3: removing {file}, see zero_to_fp32.py to recover weights") + os.remove(file) + + # now save the real model if stage3_gather_fp16_weights_on_model_save=True + # if false it will not be saved.
+ # This must be called on all ranks + self.deepspeed.save_fp16_model(output_dir, WEIGHTS_NAME) + elif self.is_world_process_zero(): self._save(output_dir) @@ -1848,10 +1867,20 @@ def prediction_loop( prediction_loss_only if prediction_loss_only is not None else self.args.prediction_loss_only ) - if self.args.deepspeed and not self.args.do_train: - # no harm, but flagging to the user that deepspeed config is ignored for eval - # flagging only for when --do_train wasn't passed as only then it's redundant - logger.info("Detected the deepspeed argument but it will not be used for evaluation") + # if eval is called w/o train init deepspeed here + if self.args.deepspeed and not self.deepspeed: + + # XXX: eval doesn't have `resume_from_checkpoint` arg but we should be able to do eval + # from the checkpoint eventually + deepspeed_engine, _, _ = deepspeed_init(self, num_training_steps=0, resume_from_checkpoint=None) + self.model = deepspeed_engine.module + self.model_wrapped = deepspeed_engine + self.deepspeed = deepspeed_engine + # XXX: we don't need optim/sched for inference, but this needs to be sorted out, since + # for example the Z3-optimizer is a must for zero3 to work even for inference - what we + # don't need is the deepspeed basic optimizer which is self.optimizer.optimizer + deepspeed_engine.optimizer.optimizer = None + deepspeed_engine.lr_scheduler = None model = self._wrap_model(self.model, training=False) @@ -1966,7 +1995,7 @@ def prediction_step( inputs: Dict[str, Union[torch.Tensor, Any]], prediction_loss_only: bool, ignore_keys: Optional[List[str]] = None, - ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: + ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: """ Perform an evaluation step on :obj:`model` using obj:`inputs`. @@ -1987,8 +2016,8 @@ def prediction_step( gathering predictions. Return: - Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and - labels (each being optional). + Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, + logits and labels (each being optional). """ has_labels = all(inputs.get(k) is not None for k in self.label_names) inputs = self._prepare_inputs(inputs) diff --git a/src/transformers/trainer_callback.py b/src/transformers/trainer_callback.py index 9409f8aaf693aa..151dbf52a0c82e 100644 --- a/src/transformers/trainer_callback.py +++ b/src/transformers/trainer_callback.py @@ -289,7 +289,7 @@ def __init__(self, callbacks, model, tokenizer, optimizer, lr_scheduler): self.eval_dataloader = None if not any(isinstance(cb, DefaultFlowCallback) for cb in self.callbacks): - logger.warn( + logger.warning( "The Trainer will not work properly if you don't have a `DefaultFlowCallback` in its callbacks. You\n" + "should add one before training with `trainer.add_callback(DefaultFlowCallback). The current list of" + "callbacks is\n:" @@ -300,7 +300,7 @@ def add_callback(self, callback): cb = callback() if isinstance(callback, type) else callback cb_class = callback if isinstance(callback, type) else callback.__class__ if cb_class in [c.__class__ for c in self.callbacks]: - logger.warn( + logger.warning( f"You are adding a {cb_class} to the callbacks of this Trainer, but there is already one. 
The current" + "list of callbacks is\n:" + self.callback_list diff --git a/src/transformers/trainer_pt_utils.py b/src/transformers/trainer_pt_utils.py index eedbb616fe548d..0d3fe6407c66f4 100644 --- a/src/transformers/trainer_pt_utils.py +++ b/src/transformers/trainer_pt_utils.py @@ -391,7 +391,7 @@ def finalize(self): if self._storage is None: return if self._offsets[0] != self.process_length: - logger.warn("Not all data has been set. Are you sure you passed all values?") + logger.warning("Not all data has been set. Are you sure you passed all values?") return nested_truncate(self._storage, self.num_samples) @@ -589,7 +589,7 @@ def _get_learning_rate(self): last_lr = self.lr_scheduler.get_last_lr()[0] except AssertionError as e: if "need to call step" in str(e): - logger.warn("tried to get lr value before scheduler/optimizer started stepping, returning lr=0") + logger.warning("tried to get lr value before scheduler/optimizer started stepping, returning lr=0") last_lr = 0 else: raise diff --git a/src/transformers/trainer_seq2seq.py b/src/transformers/trainer_seq2seq.py index b4399c80eddf02..1298c62fc5cff0 100644 --- a/src/transformers/trainer_seq2seq.py +++ b/src/transformers/trainer_seq2seq.py @@ -19,6 +19,7 @@ from torch import nn from torch.utils.data.dataset import Dataset +from .integrations import is_deepspeed_zero3_enabled from .trainer import Trainer from .trainer_utils import PredictionOutput from .utils import logging @@ -156,9 +157,11 @@ def prediction_step( has_labels = "labels" in inputs inputs = self._prepare_inputs(inputs) + # XXX: adapt synced_gpus for fairscale as well gen_kwargs = { "max_length": self._max_length if self._max_length is not None else self.model.config.max_length, "num_beams": self._num_beams if self._num_beams is not None else self.model.config.num_beams, + "synced_gpus": True if is_deepspeed_zero3_enabled() else False, } generated_tokens = self.model.generate( diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 9e5535529065de..188bf92b63df05 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -531,6 +531,12 @@ class TrainingArguments: ) def __post_init__(self): + # Handle --use_env option in torch.distributed.launch (local_rank not passed as an arg then). + # This needs to happen before any call to self.device or self.n_gpu. + env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) + if env_local_rank != -1 and env_local_rank != self.local_rank: + self.local_rank = env_local_rank + # expand paths, if not os.makedirs("~/bar") will make directory # in the current directory instead of the actual home #  see https://github.com/huggingface/transformers/issues/10628 diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index 848724d3f54371..4c598415d554b3 100644 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -23,7 +23,7 @@ def check_min_version(min_version): if version.parse(__version__) < version.parse(min_version): if "dev" in min_version: error_message = ( - "This example requires a source install from 🤗 Transformers (see " + "This example requires a source install from HuggingFace Transformers (see " "`https://huggingface.co/transformers/installation.html#installing-from-source`)," ) else: @@ -33,6 +33,6 @@ def check_min_version(min_version): error_message + ( "Check out https://huggingface.co/transformers/examples.html for the examples corresponding to other " - "versions of 🤗 Transformers." 
+ "versions of HuggingFace Transformers." ) ) diff --git a/src/transformers/utils/dummy_flax_objects.py b/src/transformers/utils/dummy_flax_objects.py index 8649d1c5e53f71..1b1e61b6298693 100644 --- a/src/transformers/utils/dummy_flax_objects.py +++ b/src/transformers/utils/dummy_flax_objects.py @@ -1,14 +1,14 @@ # This file is autogenerated by the command `make fix-copies`, do not edit. -from ..file_utils import requires_flax +from ..file_utils import requires_backends class FlaxPreTrainedModel: def __init__(self, *args, **kwargs): - requires_flax(self) + requires_backends(self, ["flax"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_flax(self) + requires_backends(self, ["flax"]) FLAX_MODEL_FOR_MASKED_LM_MAPPING = None @@ -37,153 +37,153 @@ def from_pretrained(self, *args, **kwargs): class FlaxAutoModel: def __init__(self, *args, **kwargs): - requires_flax(self) + requires_backends(self, ["flax"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_flax(self) + requires_backends(self, ["flax"]) class FlaxAutoModelForMaskedLM: def __init__(self, *args, **kwargs): - requires_flax(self) + requires_backends(self, ["flax"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_flax(self) + requires_backends(self, ["flax"]) class FlaxAutoModelForMultipleChoice: def __init__(self, *args, **kwargs): - requires_flax(self) + requires_backends(self, ["flax"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_flax(self) + requires_backends(self, ["flax"]) class FlaxAutoModelForNextSentencePrediction: def __init__(self, *args, **kwargs): - requires_flax(self) + requires_backends(self, ["flax"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_flax(self) + requires_backends(self, ["flax"]) class FlaxAutoModelForPreTraining: def __init__(self, *args, **kwargs): - requires_flax(self) + requires_backends(self, ["flax"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_flax(self) + requires_backends(self, ["flax"]) class FlaxAutoModelForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_flax(self) + requires_backends(self, ["flax"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_flax(self) + requires_backends(self, ["flax"]) class FlaxAutoModelForSequenceClassification: def __init__(self, *args, **kwargs): - requires_flax(self) + requires_backends(self, ["flax"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_flax(self) + requires_backends(self, ["flax"]) class FlaxAutoModelForTokenClassification: def __init__(self, *args, **kwargs): - requires_flax(self) + requires_backends(self, ["flax"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_flax(self) + requires_backends(self, ["flax"]) class FlaxBertForMaskedLM: def __init__(self, *args, **kwargs): - requires_flax(self) + requires_backends(self, ["flax"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_flax(self) + requires_backends(self, ["flax"]) class FlaxBertForMultipleChoice: def __init__(self, *args, **kwargs): - requires_flax(self) + requires_backends(self, ["flax"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_flax(self) + requires_backends(self, ["flax"]) class FlaxBertForNextSentencePrediction: def __init__(self, *args, **kwargs): - requires_flax(self) + requires_backends(self, ["flax"]) class FlaxBertForPreTraining: def __init__(self, *args, **kwargs): - requires_flax(self) + requires_backends(self, ["flax"]) class 
FlaxBertForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_flax(self) + requires_backends(self, ["flax"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_flax(self) + requires_backends(self, ["flax"]) class FlaxBertForSequenceClassification: def __init__(self, *args, **kwargs): - requires_flax(self) + requires_backends(self, ["flax"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_flax(self) + requires_backends(self, ["flax"]) class FlaxBertForTokenClassification: def __init__(self, *args, **kwargs): - requires_flax(self) + requires_backends(self, ["flax"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_flax(self) + requires_backends(self, ["flax"]) class FlaxBertModel: def __init__(self, *args, **kwargs): - requires_flax(self) + requires_backends(self, ["flax"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_flax(self) + requires_backends(self, ["flax"]) class FlaxBertPreTrainedModel: def __init__(self, *args, **kwargs): - requires_flax(self) + requires_backends(self, ["flax"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_flax(self) + requires_backends(self, ["flax"]) class FlaxRobertaModel: def __init__(self, *args, **kwargs): - requires_flax(self) + requires_backends(self, ["flax"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_flax(self) + requires_backends(self, ["flax"]) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 942d267cfad426..ac8ee4d488c19d 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1,247 +1,247 @@ # This file is autogenerated by the command `make fix-copies`, do not edit. 
-from ..file_utils import requires_pytorch +from ..file_utils import requires_backends class PyTorchBenchmark: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class PyTorchBenchmarkArguments: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class DataCollator: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class DataCollatorForLanguageModeling: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class DataCollatorForPermutationLanguageModeling: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class DataCollatorForSeq2Seq: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class DataCollatorForSOP: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class DataCollatorForTokenClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class DataCollatorForWholeWordMask: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class DataCollatorWithPadding: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) def default_data_collator(*args, **kwargs): - requires_pytorch(default_data_collator) + requires_backends(default_data_collator, ["torch"]) class GlueDataset: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class GlueDataTrainingArguments: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class LineByLineTextDataset: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class LineByLineWithRefDataset: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class LineByLineWithSOPTextDataset: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class SquadDataset: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class SquadDataTrainingArguments: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class TextDataset: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class TextDatasetForNextSentencePrediction: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class BeamScorer: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class BeamSearchScorer: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class ForcedBOSTokenLogitsProcessor: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class ForcedEOSTokenLogitsProcessor: def __init__(self, *args, **kwargs): - requires_pytorch(self) + 
requires_backends(self, ["torch"]) class HammingDiversityLogitsProcessor: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class InfNanRemoveLogitsProcessor: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class LogitsProcessor: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class LogitsProcessorList: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class LogitsWarper: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MinLengthLogitsProcessor: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class NoBadWordsLogitsProcessor: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class NoRepeatNGramLogitsProcessor: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class PrefixConstrainedLogitsProcessor: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class RepetitionPenaltyLogitsProcessor: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class TemperatureLogitsWarper: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class TopKLogitsWarper: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class TopPLogitsWarper: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MaxLengthCriteria: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MaxTimeCriteria: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class StoppingCriteria: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class StoppingCriteriaList: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) def top_k_top_p_filtering(*args, **kwargs): - requires_pytorch(top_k_top_p_filtering) + requires_backends(top_k_top_p_filtering, ["torch"]) class Conv1D: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class PreTrainedModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) def apply_chunking_to_forward(*args, **kwargs): - requires_pytorch(apply_chunking_to_forward) + requires_backends(apply_chunking_to_forward, ["torch"]) def prune_layer(*args, **kwargs): - requires_pytorch(prune_layer) + requires_backends(prune_layer, ["torch"]) ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -249,74 +249,74 @@ def prune_layer(*args, **kwargs): class AlbertForMaskedLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class AlbertForMultipleChoice: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + 
requires_backends(self, ["torch"]) class AlbertForPreTraining: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class AlbertForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class AlbertForSequenceClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class AlbertForTokenClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class AlbertModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class AlbertPreTrainedModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) def load_tf_weights_in_albert(*args, **kwargs): - requires_pytorch(load_tf_weights_in_albert) + requires_backends(load_tf_weights_in_albert, ["torch"]) MODEL_FOR_CAUSAL_LM_MAPPING = None @@ -360,110 +360,110 @@ def load_tf_weights_in_albert(*args, **kwargs): class AutoModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class AutoModelForCausalLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class AutoModelForMaskedLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class AutoModelForMultipleChoice: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class AutoModelForNextSentencePrediction: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class AutoModelForPreTraining: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class AutoModelForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class AutoModelForSeq2SeqLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod 
def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class AutoModelForSequenceClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class AutoModelForTableQuestionAnswering: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class AutoModelForTokenClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class AutoModelWithLMHead: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) BART_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -471,61 +471,61 @@ def from_pretrained(self, *args, **kwargs): class BartForCausalLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class BartForConditionalGeneration: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class BartForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class BartForSequenceClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class BartModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class BartPretrainedModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class PretrainedBartModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) BERT_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -533,107 +533,107 @@ def from_pretrained(self, *args, **kwargs): class BertForMaskedLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class BertForMultipleChoice: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class BertForNextSentencePrediction: def __init__(self, *args, **kwargs): - requires_pytorch(self) + 
requires_backends(self, ["torch"]) class BertForPreTraining: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class BertForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class BertForSequenceClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class BertForTokenClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class BertLayer: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class BertLMHeadModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class BertModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class BertPreTrainedModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) def load_tf_weights_in_bert(*args, **kwargs): - requires_pytorch(load_tf_weights_in_bert) + requires_backends(load_tf_weights_in_bert, ["torch"]) class BertGenerationDecoder: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class BertGenerationEncoder: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) def load_tf_weights_in_bert_generation(*args, **kwargs): - requires_pytorch(load_tf_weights_in_bert_generation) + requires_backends(load_tf_weights_in_bert_generation, ["torch"]) BIG_BIRD_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -641,84 +641,84 @@ def load_tf_weights_in_bert_generation(*args, **kwargs): class BigBirdForCausalLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class BigBirdForMaskedLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class BigBirdForMultipleChoice: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class BigBirdForPreTraining: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class BigBirdForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class BigBirdForSequenceClassification: def __init__(self, *args, **kwargs): - 
requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class BigBirdForTokenClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class BigBirdLayer: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class BigBirdModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class BigBirdPreTrainedModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) def load_tf_weights_in_big_bird(*args, **kwargs): - requires_pytorch(load_tf_weights_in_big_bird) + requires_backends(load_tf_weights_in_big_bird, ["torch"]) BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -726,25 +726,25 @@ def load_tf_weights_in_big_bird(*args, **kwargs): class BlenderbotForCausalLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class BlenderbotForConditionalGeneration: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class BlenderbotModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) BLENDERBOT_SMALL_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -752,25 +752,25 @@ def from_pretrained(self, *args, **kwargs): class BlenderbotSmallForCausalLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class BlenderbotSmallForConditionalGeneration: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class BlenderbotSmallModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -778,61 +778,61 @@ def from_pretrained(self, *args, **kwargs): class CamembertForCausalLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class CamembertForMaskedLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class CamembertForMultipleChoice: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class CamembertForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_pytorch(self) + 
requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class CamembertForSequenceClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class CamembertForTokenClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class CamembertModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -840,74 +840,74 @@ def from_pretrained(self, *args, **kwargs): class ConvBertForMaskedLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class ConvBertForMultipleChoice: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class ConvBertForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class ConvBertForSequenceClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class ConvBertForTokenClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class ConvBertLayer: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class ConvBertModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class ConvBertPreTrainedModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) def load_tf_weights_in_convbert(*args, **kwargs): - requires_pytorch(load_tf_weights_in_convbert) + requires_backends(load_tf_weights_in_convbert, ["torch"]) CTRL_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -915,38 +915,38 @@ def load_tf_weights_in_convbert(*args, **kwargs): class CTRLForSequenceClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class CTRLLMHeadModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) 
@classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class CTRLModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class CTRLPreTrainedModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -954,56 +954,56 @@ def from_pretrained(self, *args, **kwargs): class DebertaForMaskedLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class DebertaForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class DebertaForSequenceClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class DebertaForTokenClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class DebertaModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class DebertaPreTrainedModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -1011,56 +1011,56 @@ def from_pretrained(self, *args, **kwargs): class DebertaV2ForMaskedLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class DebertaV2ForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class DebertaV2ForSequenceClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class DebertaV2ForTokenClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class DebertaV2Model: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + 
requires_backends(self, ["torch"]) class DebertaV2PreTrainedModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -1068,65 +1068,65 @@ def from_pretrained(self, *args, **kwargs): class DistilBertForMaskedLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class DistilBertForMultipleChoice: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class DistilBertForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class DistilBertForSequenceClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class DistilBertForTokenClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class DistilBertModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class DistilBertPreTrainedModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -1140,32 +1140,32 @@ def from_pretrained(self, *args, **kwargs): class DPRContextEncoder: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class DPRPretrainedContextEncoder: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class DPRPretrainedQuestionEncoder: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class DPRPretrainedReader: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class DPRQuestionEncoder: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class DPRReader: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -1173,83 +1173,83 @@ def __init__(self, *args, **kwargs): class ElectraForMaskedLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class ElectraForMultipleChoice: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def 
from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class ElectraForPreTraining: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class ElectraForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class ElectraForSequenceClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class ElectraForTokenClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class ElectraModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class ElectraPreTrainedModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) def load_tf_weights_in_electra(*args, **kwargs): - requires_pytorch(load_tf_weights_in_electra) + requires_backends(load_tf_weights_in_electra, ["torch"]) class EncoderDecoderModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -1257,92 +1257,92 @@ def from_pretrained(self, *args, **kwargs): class FlaubertForMultipleChoice: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class FlaubertForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class FlaubertForQuestionAnsweringSimple: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class FlaubertForSequenceClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class FlaubertForTokenClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class FlaubertModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class FlaubertWithLMHeadModel: def 
__init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class FSMTForConditionalGeneration: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class FSMTModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class PretrainedFSMTModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) FUNNEL_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -1350,74 +1350,74 @@ def from_pretrained(self, *args, **kwargs): class FunnelBaseModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class FunnelForMaskedLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class FunnelForMultipleChoice: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class FunnelForPreTraining: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class FunnelForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class FunnelForSequenceClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class FunnelForTokenClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class FunnelModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) def load_tf_weights_in_funnel(*args, **kwargs): - requires_pytorch(load_tf_weights_in_funnel) + requires_backends(load_tf_weights_in_funnel, ["torch"]) GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -1425,51 +1425,51 @@ def load_tf_weights_in_funnel(*args, **kwargs): class GPT2DoubleHeadsModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class GPT2ForSequenceClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + 
requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class GPT2LMHeadModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class GPT2Model: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class GPT2PreTrainedModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) def load_tf_weights_in_gpt2(*args, **kwargs): - requires_pytorch(load_tf_weights_in_gpt2) + requires_backends(load_tf_weights_in_gpt2, ["torch"]) GPT_NEO_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -1477,29 +1477,29 @@ def load_tf_weights_in_gpt2(*args, **kwargs): class GPTNeoForCausalLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class GPTNeoModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class GPTNeoPreTrainedModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) def load_tf_weights_in_gpt_neo(*args, **kwargs): - requires_pytorch(load_tf_weights_in_gpt_neo) + requires_backends(load_tf_weights_in_gpt_neo, ["torch"]) IBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -1507,65 +1507,65 @@ def load_tf_weights_in_gpt_neo(*args, **kwargs): class IBertForMaskedLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class IBertForMultipleChoice: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class IBertForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class IBertForSequenceClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class IBertForTokenClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class IBertModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class IBertPreTrainedModel: def __init__(self, *args, 
**kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -1573,38 +1573,38 @@ def from_pretrained(self, *args, **kwargs): class LayoutLMForMaskedLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class LayoutLMForSequenceClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class LayoutLMForTokenClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class LayoutLMModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) LED_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -1612,38 +1612,38 @@ def from_pretrained(self, *args, **kwargs): class LEDForConditionalGeneration: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class LEDForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class LEDForSequenceClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class LEDModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -1651,108 +1651,108 @@ def from_pretrained(self, *args, **kwargs): class LongformerForMaskedLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class LongformerForMultipleChoice: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class LongformerForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class LongformerForSequenceClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class 
LongformerForTokenClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class LongformerModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class LongformerSelfAttention: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class LxmertEncoder: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class LxmertForPreTraining: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class LxmertForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class LxmertModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class LxmertPreTrainedModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class LxmertVisualFeatureEncoder: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class LxmertXLayer: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) M2M_100_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -1760,103 +1760,175 @@ def __init__(self, *args, **kwargs): class M2M100ForConditionalGeneration: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class M2M100Model: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MarianForCausalLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MarianModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MarianMTModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MBartForCausalLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MBartForConditionalGeneration: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MBartForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_pytorch(self) + 
requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MBartForSequenceClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MBartModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) + + +MEGATRON_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class MegatronBertForCausalLM: + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class MegatronBertForMaskedLM: + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class MegatronBertForMultipleChoice: + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class MegatronBertForNextSentencePrediction: + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class MegatronBertForPreTraining: + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class MegatronBertForQuestionAnswering: + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class MegatronBertForSequenceClassification: + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class MegatronBertForTokenClassification: + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class MegatronBertModel: + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_backends(self, ["torch"]) class MMBTForClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MMBTModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class ModalEmbeddings: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -1864,84 +1936,84 @@ def __init__(self, *args, **kwargs): class MobileBertForMaskedLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MobileBertForMultipleChoice: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MobileBertForNextSentencePrediction: def __init__(self, *args, **kwargs): - 
requires_pytorch(self) + requires_backends(self, ["torch"]) class MobileBertForPreTraining: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MobileBertForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MobileBertForSequenceClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MobileBertForTokenClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MobileBertLayer: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MobileBertModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MobileBertPreTrainedModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) def load_tf_weights_in_mobilebert(*args, **kwargs): - requires_pytorch(load_tf_weights_in_mobilebert) + requires_backends(load_tf_weights_in_mobilebert, ["torch"]) MPNET_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -1949,97 +2021,97 @@ def load_tf_weights_in_mobilebert(*args, **kwargs): class MPNetForMaskedLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MPNetForMultipleChoice: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MPNetForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MPNetForSequenceClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MPNetForTokenClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MPNetLayer: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MPNetModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MPNetPreTrainedModel: def __init__(self, *args, **kwargs): - 
requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MT5EncoderModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MT5ForConditionalGeneration: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MT5Model: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -2047,74 +2119,74 @@ def from_pretrained(self, *args, **kwargs): class OpenAIGPTDoubleHeadsModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class OpenAIGPTForSequenceClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class OpenAIGPTLMHeadModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class OpenAIGPTModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class OpenAIGPTPreTrainedModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) def load_tf_weights_in_openai_gpt(*args, **kwargs): - requires_pytorch(load_tf_weights_in_openai_gpt) + requires_backends(load_tf_weights_in_openai_gpt, ["torch"]) class PegasusForCausalLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class PegasusForConditionalGeneration: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class PegasusModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -2122,63 +2194,63 @@ def from_pretrained(self, *args, **kwargs): class ProphetNetDecoder: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class ProphetNetEncoder: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class ProphetNetForCausalLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class 
ProphetNetForConditionalGeneration: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class ProphetNetModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class ProphetNetPreTrainedModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class RagModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class RagSequenceForGeneration: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class RagTokenForGeneration: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -2186,57 +2258,57 @@ def __init__(self, *args, **kwargs): class ReformerAttention: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class ReformerForMaskedLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class ReformerForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class ReformerForSequenceClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class ReformerLayer: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class ReformerModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class ReformerModelWithLMHead: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) RETRIBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -2244,20 +2316,20 @@ def from_pretrained(self, *args, **kwargs): class RetriBertModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class RetriBertPreTrainedModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -2265,61 +2337,61 @@ def from_pretrained(self, *args, **kwargs): 
class RobertaForCausalLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class RobertaForMaskedLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class RobertaForMultipleChoice: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class RobertaForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class RobertaForSequenceClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class RobertaForTokenClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class RobertaModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -2327,20 +2399,20 @@ def from_pretrained(self, *args, **kwargs): class Speech2TextForConditionalGeneration: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class Speech2TextModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) SQUEEZEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -2348,70 +2420,70 @@ def from_pretrained(self, *args, **kwargs): class SqueezeBertForMaskedLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class SqueezeBertForMultipleChoice: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class SqueezeBertForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class SqueezeBertForSequenceClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class SqueezeBertForTokenClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def 
from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class SqueezeBertModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class SqueezeBertModule: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class SqueezeBertPreTrainedModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) T5_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -2419,42 +2491,42 @@ def from_pretrained(self, *args, **kwargs): class T5EncoderModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class T5ForConditionalGeneration: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class T5Model: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class T5PreTrainedModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) def load_tf_weights_in_t5(*args, **kwargs): - requires_pytorch(load_tf_weights_in_t5) + requires_backends(load_tf_weights_in_t5, ["torch"]) TAPAS_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -2462,38 +2534,38 @@ def load_tf_weights_in_t5(*args, **kwargs): class TapasForMaskedLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class TapasForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class TapasForSequenceClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class TapasModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -2501,47 +2573,47 @@ def from_pretrained(self, *args, **kwargs): class AdaptiveEmbedding: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class TransfoXLForSequenceClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class 
TransfoXLLMHeadModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class TransfoXLModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class TransfoXLPreTrainedModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) def load_tf_weights_in_transfo_xl(*args, **kwargs): - requires_pytorch(load_tf_weights_in_transfo_xl) + requires_backends(load_tf_weights_in_transfo_xl, ["torch"]) VIT_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -2549,25 +2621,25 @@ def load_tf_weights_in_transfo_xl(*args, **kwargs): class ViTForImageClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class ViTModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class ViTPreTrainedModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -2575,34 +2647,34 @@ def from_pretrained(self, *args, **kwargs): class Wav2Vec2ForCTC: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class Wav2Vec2ForMaskedLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class Wav2Vec2Model: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class Wav2Vec2PreTrainedModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) XLM_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -2610,74 +2682,74 @@ def from_pretrained(self, *args, **kwargs): class XLMForMultipleChoice: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class XLMForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class XLMForQuestionAnsweringSimple: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class XLMForSequenceClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + 
requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class XLMForTokenClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class XLMModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class XLMPreTrainedModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class XLMWithLMHeadModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) XLM_PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -2685,35 +2757,35 @@ def from_pretrained(self, *args, **kwargs): class XLMProphetNetDecoder: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class XLMProphetNetEncoder: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class XLMProphetNetForCausalLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class XLMProphetNetForConditionalGeneration: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class XLMProphetNetModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -2721,61 +2793,61 @@ def from_pretrained(self, *args, **kwargs): class XLMRobertaForCausalLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class XLMRobertaForMaskedLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class XLMRobertaForMultipleChoice: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class XLMRobertaForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class XLMRobertaForSequenceClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class XLMRobertaForTokenClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod 
def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class XLMRobertaModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -2783,127 +2855,127 @@ def from_pretrained(self, *args, **kwargs): class XLNetForMultipleChoice: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class XLNetForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class XLNetForQuestionAnsweringSimple: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class XLNetForSequenceClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class XLNetForTokenClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class XLNetLMHeadModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class XLNetModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class XLNetPreTrainedModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) def load_tf_weights_in_xlnet(*args, **kwargs): - requires_pytorch(load_tf_weights_in_xlnet) + requires_backends(load_tf_weights_in_xlnet, ["torch"]) class Adafactor: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class AdamW: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) def get_constant_schedule(*args, **kwargs): - requires_pytorch(get_constant_schedule) + requires_backends(get_constant_schedule, ["torch"]) def get_constant_schedule_with_warmup(*args, **kwargs): - requires_pytorch(get_constant_schedule_with_warmup) + requires_backends(get_constant_schedule_with_warmup, ["torch"]) def get_cosine_schedule_with_warmup(*args, **kwargs): - requires_pytorch(get_cosine_schedule_with_warmup) + requires_backends(get_cosine_schedule_with_warmup, ["torch"]) def get_cosine_with_hard_restarts_schedule_with_warmup(*args, **kwargs): - requires_pytorch(get_cosine_with_hard_restarts_schedule_with_warmup) + requires_backends(get_cosine_with_hard_restarts_schedule_with_warmup, ["torch"]) def 
get_linear_schedule_with_warmup(*args, **kwargs): - requires_pytorch(get_linear_schedule_with_warmup) + requires_backends(get_linear_schedule_with_warmup, ["torch"]) def get_polynomial_decay_schedule_with_warmup(*args, **kwargs): - requires_pytorch(get_polynomial_decay_schedule_with_warmup) + requires_backends(get_polynomial_decay_schedule_with_warmup, ["torch"]) def get_scheduler(*args, **kwargs): - requires_pytorch(get_scheduler) + requires_backends(get_scheduler, ["torch"]) class Trainer: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) def torch_distributed_zero_first(*args, **kwargs): - requires_pytorch(torch_distributed_zero_first) + requires_backends(torch_distributed_zero_first, ["torch"]) class Seq2SeqTrainer: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) diff --git a/src/transformers/utils/dummy_sentencepiece_and_speech_objects.py b/src/transformers/utils/dummy_sentencepiece_and_speech_objects.py new file mode 100644 index 00000000000000..b030ce604a584c --- /dev/null +++ b/src/transformers/utils/dummy_sentencepiece_and_speech_objects.py @@ -0,0 +1,7 @@ +# This file is autogenerated by the command `make fix-copies`, do not edit. +from ..file_utils import requires_backends + + +class Speech2TextProcessor: + def __init__(self, *args, **kwargs): + requires_backends(self, ["sentencepiece", "speech"]) diff --git a/src/transformers/utils/dummy_sentencepiece_and_tokenizers_objects.py b/src/transformers/utils/dummy_sentencepiece_and_tokenizers_objects.py new file mode 100644 index 00000000000000..0cb93ec194f9d0 --- /dev/null +++ b/src/transformers/utils/dummy_sentencepiece_and_tokenizers_objects.py @@ -0,0 +1,9 @@ +# This file is autogenerated by the command `make fix-copies`, do not edit. +from ..file_utils import requires_backends + + +SLOW_TO_FAST_CONVERTERS = None + + +def convert_slow_tokenizer(*args, **kwargs): + requires_backends(convert_slow_tokenizer, ["sentencepiece", "tokenizers"]) diff --git a/src/transformers/utils/dummy_sentencepiece_objects.py b/src/transformers/utils/dummy_sentencepiece_objects.py index 2ef3165d7f087c..d87263c8c74037 100644 --- a/src/transformers/utils/dummy_sentencepiece_objects.py +++ b/src/transformers/utils/dummy_sentencepiece_objects.py @@ -1,160 +1,155 @@ # This file is autogenerated by the command `make fix-copies`, do not edit. 
-from ..file_utils import requires_sentencepiece
+from ..file_utils import requires_backends
 
 
 class AlbertTokenizer:
     def __init__(self, *args, **kwargs):
-        requires_sentencepiece(self)
+        requires_backends(self, ["sentencepiece"])
 
     @classmethod
     def from_pretrained(self, *args, **kwargs):
-        requires_sentencepiece(self)
+        requires_backends(self, ["sentencepiece"])
 
 
 class BarthezTokenizer:
     def __init__(self, *args, **kwargs):
-        requires_sentencepiece(self)
+        requires_backends(self, ["sentencepiece"])
 
     @classmethod
     def from_pretrained(self, *args, **kwargs):
-        requires_sentencepiece(self)
+        requires_backends(self, ["sentencepiece"])
 
 
 class BertGenerationTokenizer:
     def __init__(self, *args, **kwargs):
-        requires_sentencepiece(self)
+        requires_backends(self, ["sentencepiece"])
 
     @classmethod
     def from_pretrained(self, *args, **kwargs):
-        requires_sentencepiece(self)
+        requires_backends(self, ["sentencepiece"])
 
 
 class CamembertTokenizer:
     def __init__(self, *args, **kwargs):
-        requires_sentencepiece(self)
+        requires_backends(self, ["sentencepiece"])
 
     @classmethod
     def from_pretrained(self, *args, **kwargs):
-        requires_sentencepiece(self)
+        requires_backends(self, ["sentencepiece"])
 
 
 class DebertaV2Tokenizer:
     def __init__(self, *args, **kwargs):
-        requires_sentencepiece(self)
+        requires_backends(self, ["sentencepiece"])
 
     @classmethod
     def from_pretrained(self, *args, **kwargs):
-        requires_sentencepiece(self)
+        requires_backends(self, ["sentencepiece"])
 
 
 class M2M100Tokenizer:
     def __init__(self, *args, **kwargs):
-        requires_sentencepiece(self)
+        requires_backends(self, ["sentencepiece"])
 
     @classmethod
     def from_pretrained(self, *args, **kwargs):
-        requires_sentencepiece(self)
+        requires_backends(self, ["sentencepiece"])
 
 
 class MarianTokenizer:
     def __init__(self, *args, **kwargs):
-        requires_sentencepiece(self)
+        requires_backends(self, ["sentencepiece"])
 
     @classmethod
     def from_pretrained(self, *args, **kwargs):
-        requires_sentencepiece(self)
+        requires_backends(self, ["sentencepiece"])
 
 
 class MBart50Tokenizer:
     def __init__(self, *args, **kwargs):
-        requires_sentencepiece(self)
+        requires_backends(self, ["sentencepiece"])
 
     @classmethod
     def from_pretrained(self, *args, **kwargs):
-        requires_sentencepiece(self)
+        requires_backends(self, ["sentencepiece"])
 
 
 class MBartTokenizer:
     def __init__(self, *args, **kwargs):
-        requires_sentencepiece(self)
+        requires_backends(self, ["sentencepiece"])
 
     @classmethod
     def from_pretrained(self, *args, **kwargs):
-        requires_sentencepiece(self)
+        requires_backends(self, ["sentencepiece"])
 
 
 class MT5Tokenizer:
     def __init__(self, *args, **kwargs):
-        requires_sentencepiece(self)
+        requires_backends(self, ["sentencepiece"])
 
     @classmethod
     def from_pretrained(self, *args, **kwargs):
-        requires_sentencepiece(self)
+        requires_backends(self, ["sentencepiece"])
 
 
 class PegasusTokenizer:
     def __init__(self, *args, **kwargs):
-        requires_sentencepiece(self)
+        requires_backends(self, ["sentencepiece"])
 
     @classmethod
     def from_pretrained(self, *args, **kwargs):
-        requires_sentencepiece(self)
+        requires_backends(self, ["sentencepiece"])
 
 
 class ReformerTokenizer:
     def __init__(self, *args, **kwargs):
-        requires_sentencepiece(self)
+        requires_backends(self, ["sentencepiece"])
 
     @classmethod
     def from_pretrained(self, *args, **kwargs):
-        requires_sentencepiece(self)
-
-
-class Speech2TextProcessor:
-    def __init__(self, *args, **kwargs):
-        requires_sentencepiece(self)
+        requires_backends(self, ["sentencepiece"])
 
 
 class Speech2TextTokenizer:
     def __init__(self, *args, **kwargs):
-        requires_sentencepiece(self)
+        requires_backends(self, ["sentencepiece"])
 
     @classmethod
     def from_pretrained(self, *args, **kwargs):
-        requires_sentencepiece(self)
+        requires_backends(self, ["sentencepiece"])
 
 
 class T5Tokenizer:
     def __init__(self, *args, **kwargs):
-        requires_sentencepiece(self)
+        requires_backends(self, ["sentencepiece"])
 
     @classmethod
     def from_pretrained(self, *args, **kwargs):
-        requires_sentencepiece(self)
+        requires_backends(self, ["sentencepiece"])
 
 
 class XLMProphetNetTokenizer:
     def __init__(self, *args, **kwargs):
-        requires_sentencepiece(self)
+        requires_backends(self, ["sentencepiece"])
 
     @classmethod
     def from_pretrained(self, *args, **kwargs):
-        requires_sentencepiece(self)
+        requires_backends(self, ["sentencepiece"])
 
 
 class XLMRobertaTokenizer:
     def __init__(self, *args, **kwargs):
-        requires_sentencepiece(self)
+        requires_backends(self, ["sentencepiece"])
 
     @classmethod
     def from_pretrained(self, *args, **kwargs):
-        requires_sentencepiece(self)
+        requires_backends(self, ["sentencepiece"])
 
 
 class XLNetTokenizer:
     def __init__(self, *args, **kwargs):
-        requires_sentencepiece(self)
+        requires_backends(self, ["sentencepiece"])
 
     @classmethod
     def from_pretrained(self, *args, **kwargs):
-        requires_sentencepiece(self)
+        requires_backends(self, ["sentencepiece"])
diff --git a/src/transformers/utils/dummy_speech_objects.py b/src/transformers/utils/dummy_speech_objects.py
new file mode 100644
index 00000000000000..9dd744f1997b9c
--- /dev/null
+++ b/src/transformers/utils/dummy_speech_objects.py
@@ -0,0 +1,7 @@
+# This file is autogenerated by the command `make fix-copies`, do not edit.
+from ..file_utils import requires_backends
+
+
+class Speech2TextFeatureExtractor:
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["speech"])
diff --git a/src/transformers/utils/dummy_tf_objects.py b/src/transformers/utils/dummy_tf_objects.py
index deeea052130ee7..d9124ec7d024be 100644
--- a/src/transformers/utils/dummy_tf_objects.py
+++ b/src/transformers/utils/dummy_tf_objects.py
@@ -1,19 +1,19 @@
 # This file is autogenerated by the command `make fix-copies`, do not edit.
-from ..file_utils import requires_tf +from ..file_utils import requires_backends class TensorFlowBenchmarkArguments: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TensorFlowBenchmark: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) def tf_top_k_top_p_filtering(*args, **kwargs): - requires_tf(tf_top_k_top_p_filtering) + requires_backends(tf_top_k_top_p_filtering, ["tf"]) TF_LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -21,75 +21,75 @@ def tf_top_k_top_p_filtering(*args, **kwargs): class TFLayoutLMForMaskedLM: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFLayoutLMForSequenceClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFLayoutLMForTokenClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFLayoutLMMainLayer: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFLayoutLMModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFLayoutLMPreTrainedModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFPreTrainedModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFSequenceSummary: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFSharedEmbeddings: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) def shape_list(*args, **kwargs): - requires_tf(shape_list) + requires_backends(shape_list, ["tf"]) TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -97,75 +97,75 @@ def shape_list(*args, **kwargs): class TFAlbertForMaskedLM: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFAlbertForMultipleChoice: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFAlbertForPreTraining: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFAlbertForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFAlbertForSequenceClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - 
requires_tf(self) + requires_backends(self, ["tf"]) class TFAlbertForTokenClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFAlbertMainLayer: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFAlbertModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFAlbertPreTrainedModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) TF_MODEL_FOR_CAUSAL_LM_MAPPING = None @@ -203,119 +203,119 @@ def from_pretrained(self, *args, **kwargs): class TFAutoModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFAutoModelForCausalLM: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFAutoModelForMaskedLM: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFAutoModelForMultipleChoice: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFAutoModelForPreTraining: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFAutoModelForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFAutoModelForSeq2SeqLM: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFAutoModelForSequenceClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFAutoModelForTokenClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFAutoModelWithLMHead: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFBartForConditionalGeneration: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, 
["tf"]) class TFBartModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFBartPretrainedModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -323,130 +323,130 @@ def from_pretrained(self, *args, **kwargs): class TFBertEmbeddings: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFBertForMaskedLM: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFBertForMultipleChoice: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFBertForNextSentencePrediction: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFBertForPreTraining: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFBertForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFBertForSequenceClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFBertForTokenClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFBertLMHeadModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFBertMainLayer: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFBertModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFBertPreTrainedModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFBlenderbotForConditionalGeneration: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFBlenderbotModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFBlenderbotSmallForConditionalGeneration: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def 
from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFBlenderbotSmallModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -454,56 +454,56 @@ def from_pretrained(self, *args, **kwargs): class TFCamembertForMaskedLM: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFCamembertForMultipleChoice: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFCamembertForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFCamembertForSequenceClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFCamembertForTokenClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFCamembertModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) TF_CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -511,70 +511,70 @@ def from_pretrained(self, *args, **kwargs): class TFConvBertForMaskedLM: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFConvBertForMultipleChoice: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFConvBertForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFConvBertForSequenceClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFConvBertForTokenClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFConvBertLayer: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFConvBertModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFConvBertPreTrainedModel: def 
__init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -582,38 +582,38 @@ def from_pretrained(self, *args, **kwargs): class TFCTRLForSequenceClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFCTRLLMHeadModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFCTRLModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFCTRLPreTrainedModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -621,70 +621,70 @@ def from_pretrained(self, *args, **kwargs): class TFDistilBertForMaskedLM: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFDistilBertForMultipleChoice: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFDistilBertForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFDistilBertForSequenceClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFDistilBertForTokenClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFDistilBertMainLayer: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFDistilBertModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFDistilBertPreTrainedModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) TF_DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -698,32 +698,32 @@ def from_pretrained(self, *args, **kwargs): class TFDPRContextEncoder: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFDPRPretrainedContextEncoder: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFDPRPretrainedQuestionEncoder: def 
__init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFDPRPretrainedReader: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFDPRQuestionEncoder: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFDPRReader: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -731,70 +731,70 @@ def __init__(self, *args, **kwargs): class TFElectraForMaskedLM: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFElectraForMultipleChoice: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFElectraForPreTraining: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFElectraForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFElectraForSequenceClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFElectraForTokenClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFElectraModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFElectraPreTrainedModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) TF_FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -802,56 +802,56 @@ def from_pretrained(self, *args, **kwargs): class TFFlaubertForMultipleChoice: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFFlaubertForQuestionAnsweringSimple: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFFlaubertForSequenceClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFFlaubertForTokenClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFFlaubertModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, 
*args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFFlaubertWithLMHeadModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) TF_FUNNEL_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -859,70 +859,70 @@ def from_pretrained(self, *args, **kwargs): class TFFunnelBaseModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFFunnelForMaskedLM: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFFunnelForMultipleChoice: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFFunnelForPreTraining: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFFunnelForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFFunnelForSequenceClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFFunnelForTokenClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFFunnelModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -930,79 +930,79 @@ def from_pretrained(self, *args, **kwargs): class TFGPT2DoubleHeadsModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFGPT2ForSequenceClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFGPT2LMHeadModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFGPT2MainLayer: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFGPT2Model: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFGPT2PreTrainedModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) 
class TFLEDForConditionalGeneration: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFLEDModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFLEDPreTrainedModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) TF_LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -1010,61 +1010,61 @@ def from_pretrained(self, *args, **kwargs): class TFLongformerForMaskedLM: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFLongformerForMultipleChoice: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFLongformerForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFLongformerForSequenceClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFLongformerForTokenClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFLongformerModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFLongformerSelfAttention: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) TF_LXMERT_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -1072,71 +1072,71 @@ def __init__(self, *args, **kwargs): class TFLxmertForPreTraining: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFLxmertMainLayer: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFLxmertModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFLxmertPreTrainedModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFLxmertVisualFeatureEncoder: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFMarianModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFMarianMTModel: def 
__init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFMBartForConditionalGeneration: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFMBartModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) TF_MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -1144,80 +1144,80 @@ def from_pretrained(self, *args, **kwargs): class TFMobileBertForMaskedLM: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFMobileBertForMultipleChoice: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFMobileBertForNextSentencePrediction: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFMobileBertForPreTraining: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFMobileBertForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFMobileBertForSequenceClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFMobileBertForTokenClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFMobileBertMainLayer: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFMobileBertModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFMobileBertPreTrainedModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) TF_MPNET_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -1225,97 +1225,97 @@ def from_pretrained(self, *args, **kwargs): class TFMPNetForMaskedLM: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFMPNetForMultipleChoice: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFMPNetForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def 
from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFMPNetForSequenceClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFMPNetForTokenClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFMPNetMainLayer: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFMPNetModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFMPNetPreTrainedModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFMT5EncoderModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFMT5ForConditionalGeneration: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFMT5Model: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -1323,89 +1323,89 @@ def from_pretrained(self, *args, **kwargs): class TFOpenAIGPTDoubleHeadsModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFOpenAIGPTForSequenceClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFOpenAIGPTLMHeadModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFOpenAIGPTMainLayer: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFOpenAIGPTModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFOpenAIGPTPreTrainedModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFPegasusForConditionalGeneration: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFPegasusModel: def __init__(self, *args, **kwargs): - 
requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFRagModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFRagSequenceForGeneration: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFRagTokenForGeneration: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -1413,70 +1413,70 @@ def __init__(self, *args, **kwargs): class TFRobertaForMaskedLM: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFRobertaForMultipleChoice: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFRobertaForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFRobertaForSequenceClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFRobertaForTokenClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFRobertaMainLayer: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFRobertaModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFRobertaPreTrainedModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -1484,38 +1484,38 @@ def from_pretrained(self, *args, **kwargs): class TFT5EncoderModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFT5ForConditionalGeneration: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFT5Model: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFT5PreTrainedModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) 
TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -1523,48 +1523,48 @@ def from_pretrained(self, *args, **kwargs): class TFAdaptiveEmbedding: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFTransfoXLForSequenceClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFTransfoXLLMHeadModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFTransfoXLMainLayer: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFTransfoXLModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFTransfoXLPreTrainedModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -1572,70 +1572,70 @@ def from_pretrained(self, *args, **kwargs): class TFXLMForMultipleChoice: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFXLMForQuestionAnsweringSimple: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFXLMForSequenceClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFXLMForTokenClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFXLMMainLayer: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFXLMModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFXLMPreTrainedModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFXLMWithLMHeadModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -1643,56 +1643,56 @@ def from_pretrained(self, *args, **kwargs): class TFXLMRobertaForMaskedLM: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFXLMRobertaForMultipleChoice: def 
__init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFXLMRobertaForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFXLMRobertaForSequenceClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFXLMRobertaForTokenClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFXLMRobertaModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) TF_XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -1700,91 +1700,91 @@ def from_pretrained(self, *args, **kwargs): class TFXLNetForMultipleChoice: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFXLNetForQuestionAnsweringSimple: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFXLNetForSequenceClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFXLNetForTokenClassification: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFXLNetLMHeadModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFXLNetMainLayer: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFXLNetModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class TFXLNetPreTrainedModel: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class AdamWeightDecay: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class GradientAccumulator: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) class WarmUp: def __init__(self, *args, **kwargs): - requires_tf(self) + requires_backends(self, ["tf"]) def create_optimizer(*args, **kwargs): - requires_tf(create_optimizer) + requires_backends(create_optimizer, ["tf"]) class TFTrainer: def __init__(self, *args, **kwargs): - requires_tf(self) 
+ requires_backends(self, ["tf"]) diff --git a/src/transformers/utils/dummy_tokenizers_objects.py b/src/transformers/utils/dummy_tokenizers_objects.py index d9a1b8c055e619..3ebd824720b32b 100644 --- a/src/transformers/utils/dummy_tokenizers_objects.py +++ b/src/transformers/utils/dummy_tokenizers_objects.py @@ -1,306 +1,299 @@ # This file is autogenerated by the command `make fix-copies`, do not edit. -from ..file_utils import requires_tokenizers +from ..file_utils import requires_backends class AlbertTokenizerFast: def __init__(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) class BartTokenizerFast: def __init__(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) class BarthezTokenizerFast: def __init__(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) class BertTokenizerFast: def __init__(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) class CamembertTokenizerFast: def __init__(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) class ConvBertTokenizerFast: def __init__(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) class DistilBertTokenizerFast: def __init__(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) class DPRContextEncoderTokenizerFast: def __init__(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) class DPRQuestionEncoderTokenizerFast: def __init__(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) class DPRReaderTokenizerFast: def __init__(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) class ElectraTokenizerFast: def __init__(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) class FunnelTokenizerFast: def __init__(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) @classmethod def 
from_pretrained(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) class GPT2TokenizerFast: def __init__(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) class HerbertTokenizerFast: def __init__(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) class LayoutLMTokenizerFast: def __init__(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) class LEDTokenizerFast: def __init__(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) class LongformerTokenizerFast: def __init__(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) class LxmertTokenizerFast: def __init__(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) class MBart50TokenizerFast: def __init__(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) class MBartTokenizerFast: def __init__(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) class MobileBertTokenizerFast: def __init__(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) class MPNetTokenizerFast: def __init__(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) class MT5TokenizerFast: def __init__(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) class OpenAIGPTTokenizerFast: def __init__(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) class PegasusTokenizerFast: def __init__(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) class ReformerTokenizerFast: def 
__init__(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) class RetriBertTokenizerFast: def __init__(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) class RobertaTokenizerFast: def __init__(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) class SqueezeBertTokenizerFast: def __init__(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) class T5TokenizerFast: def __init__(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) class XLMRobertaTokenizerFast: def __init__(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) class XLNetTokenizerFast: def __init__(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) class PreTrainedTokenizerFast: def __init__(self, *args, **kwargs): - requires_tokenizers(self) + requires_backends(self, ["tokenizers"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_tokenizers(self) - - -SLOW_TO_FAST_CONVERTERS = None - - -def convert_slow_tokenizer(*args, **kwargs): - requires_tokenizers(convert_slow_tokenizer) + requires_backends(self, ["tokenizers"]) diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index d05d43f2046fbb..49d0f6f6c807d6 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -1,12 +1,12 @@ # This file is autogenerated by the command `make fix-copies`, do not edit. 
-from ..file_utils import requires_vision +from ..file_utils import requires_backends class ImageFeatureExtractionMixin: def __init__(self, *args, **kwargs): - requires_vision(self) + requires_backends(self, ["vision"]) class ViTFeatureExtractor: def __init__(self, *args, **kwargs): - requires_vision(self) + requires_backends(self, ["vision"]) diff --git a/src/transformers/utils/modeling_auto_mapping.py b/src/transformers/utils/modeling_auto_mapping.py index 189b2e1959f4fd..0a05ac24d795ee 100644 --- a/src/transformers/utils/modeling_auto_mapping.py +++ b/src/transformers/utils/modeling_auto_mapping.py @@ -21,6 +21,7 @@ ("BertConfig", "BertForQuestionAnswering"), ("XLNetConfig", "XLNetForQuestionAnsweringSimple"), ("FlaubertConfig", "FlaubertForQuestionAnsweringSimple"), + ("MegatronBertConfig", "MegatronBertForQuestionAnswering"), ("MobileBertConfig", "MobileBertForQuestionAnswering"), ("XLMConfig", "XLMForQuestionAnsweringSimple"), ("ElectraConfig", "ElectraForQuestionAnswering"), diff --git a/src/transformers/utils/notebook.py b/src/transformers/utils/notebook.py index 9912b736b39443..91e85a5d7ac3b4 100644 --- a/src/transformers/utils/notebook.py +++ b/src/transformers/utils/notebook.py @@ -33,15 +33,6 @@ def html_progress_bar(value, total, prefix, label, width=300): # docstyle-ignore return f"""
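Note on the pattern these dummy-object hunks migrate to: every dummy class now funnels through a single `requires_backends(obj, backends)` helper instead of one helper per backend. A simplified sketch of that pattern follows (illustrative only; the real helper in `file_utils.py` also carries per-backend installation hints and differs in detail):

    # simplified sketch of the requires_backends pattern used by the dummy objects
    def is_tokenizers_available():
        try:
            import tokenizers  # noqa: F401
            return True
        except ImportError:
            return False

    BACKENDS = {
        "tokenizers": (is_tokenizers_available, "run `pip install tokenizers` to use this object"),
        # "vision": (...), "sentencepiece": (...), etc.
    }

    def requires_backends(obj, backends):
        name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
        for backend in backends:
            available, hint = BACKENDS[backend]
            if not available():
                raise ImportError(f"{name} requires the {backend} library. {hint}")

    class GPT2TokenizerFast:  # dummy placeholder installed when `tokenizers` is absent
        def __init__(self, *args, **kwargs):
            requires_backends(self, ["tokenizers"])

The point of the dummies is only to fail late, with a readable ImportError naming the missing backend, rather than at import time.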
- {prefix} {label} @@ -327,6 +318,8 @@ def on_evaluate(self, args, state, control, metrics=None, **kwargs): values["Validation Loss"] = metrics["eval_loss"] _ = metrics.pop("total_flos", None) _ = metrics.pop("epoch", None) + _ = metrics.pop("eval_runtime", None) + _ = metrics.pop("eval_samples_per_second", None) for k, v in metrics.items(): if k == "eval_loss": values["Validation Loss"] = v diff --git a/src/transformers/utils/versions.py b/src/transformers/utils/versions.py index 028dbcc6c836a4..73151487bc71f2 100644 --- a/src/transformers/utils/versions.py +++ b/src/transformers/utils/versions.py @@ -40,6 +40,17 @@ } +def _compare_versions(op, got_ver, want_ver, requirement, pkg, hint): + if got_ver is None: + raise ValueError("got_ver is None") + if want_ver is None: + raise ValueError("want_ver is None") + if not ops[op](version.parse(got_ver), version.parse(want_ver)): + raise ImportError( + f"{requirement} is required for a normal functioning of this module, but found {pkg}=={got_ver}.{hint}" + ) + + def require_version(requirement: str, hint: Optional[str] = None) -> None: """ Perform a runtime check of the dependency versions, using the exact same syntax used by pip. @@ -49,12 +60,13 @@ def require_version(requirement: str, hint: Optional[str] = None) -> None: Args: requirement (:obj:`str`): pip style definition, e.g., "tokenizers==0.9.4", "tqdm>=4.27", "numpy" hint (:obj:`str`, `optional`): what suggestion to print in case of requirements not being met - """ - # note: while pkg_resources.require_version(requirement) is a much simpler way to do it, it - # fails if some of the dependencies of the dependencies are not matching, which is not necessarily - # bad, hence the more complicated check - which also should be faster, since it doesn't check - # dependencies of dependencies. + Example:: + + require_version("pandas>1.1.2") + require_version("numpy>1.18.5", "this is important to have for whatever reason") + + """ hint = f"\n{hint}" if hint is not None else "" @@ -62,22 +74,30 @@ def require_version(requirement: str, hint: Optional[str] = None) -> None: if re.match(r"^[\w_\-\d]+$", requirement): pkg, op, want_ver = requirement, None, None else: - match = re.findall(r"^([^!=<>\s]+)([\s!=<>]{1,2})(.+)", requirement) + match = re.findall(r"^([^!=<>\s]+)([\s!=<>]{1,2}.+)", requirement) if not match: raise ValueError( f"requirement needs to be in the pip package format, .e.g., package_a==1.23, or package_b>=1.23, but got {requirement}" ) - pkg, op, want_ver = match[0] - if op not in ops: - raise ValueError(f"need one of {list(ops.keys())}, but got {op}") + pkg, want_full = match[0] + want_range = want_full.split(",") # there could be multiple requirements + wanted = {} + for w in want_range: + match = re.findall(r"^([\s!=<>]{1,2})(.+)", w) + if not match: + raise ValueError( + f"requirement needs to be in the pip package format, .e.g., package_a==1.23, or package_b>=1.23, but got {requirement}" + ) + op, want_ver = match[0] + wanted[op] = want_ver + if op not in ops: + raise ValueError(f"{requirement}: need one of {list(ops.keys())}, but got {op}") # special case if pkg == "python": got_ver = ".".join([str(x) for x in sys.version_info[:3]]) - if not ops[op](version.parse(got_ver), version.parse(want_ver)): - raise ImportError( - f"{requirement} is required for a normal functioning of this module, but found {pkg}=={got_ver}." 
- ) + for op, want_ver in wanted.items(): + _compare_versions(op, got_ver, want_ver, requirement, pkg, hint) return # check if any version is installed @@ -88,11 +108,10 @@ def require_version(requirement: str, hint: Optional[str] = None) -> None: f"The '{requirement}' distribution was not found and is required by this application. {hint}" ) - # check that the right version is installed if version number was provided - if want_ver is not None and not ops[op](version.parse(got_ver), version.parse(want_ver)): - raise ImportError( - f"{requirement} is required for a normal functioning of this module, but found {pkg}=={got_ver}.{hint}" - ) + # check that the right version is installed if version number or a range was provided + if want_ver is not None: + for op, want_ver in wanted.items(): + _compare_versions(op, got_ver, want_ver, requirement, pkg, hint) def require_version_core(requirement): diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py index e8e0d56a4db748..005328b06d6c26 100755 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py @@ -531,7 +531,7 @@ def forward( if getattr(self.config, "gradient_checkpointing", False) and self.training: if use_cache: - logger.warn( + logger.warning( "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting " "`use_cache=False`..." ) @@ -2512,7 +2512,7 @@ def forward( if getattr(self.config, "gradient_checkpointing", False) and self.training: if use_cache: - logger.warn("`use_cache = True` is incompatible with `config.gradient_checkpointing = True`. Setting `use_cache = False`...") + logger.warning("`use_cache = True` is incompatible with `config.gradient_checkpointing = True`. Setting `use_cache = False`...") use_cache = False def create_custom_forward(module): diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py index 7973c1e1dd4915..ec154a9b1c3118 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py @@ -225,12 +225,9 @@ def get_special_tokens_mask( :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." 
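With the refactor above, `require_version` accepts a comma-separated list of constraints on a single package; the spec is split on `,`, each piece is parsed into an operator/version pair, and every pair is checked through `_compare_versions`. A short usage sketch (package names and version bounds are illustrative):

    from transformers.utils.versions import require_version

    # single constraint, as before
    require_version("pandas>1.1.2")

    # new: a range, parsed into {">=": "0.10.1", "<": "0.11"} and checked constraint by constraint
    require_version("tokenizers>=0.10.1,<0.11", "pip install -U tokenizers")

    # bare package name: only checks that some version is installed
    require_version("numpy")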
- ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) if token_ids_1 is None: return [1] + ([0] * len(token_ids_0)) + [1] diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/{{cookiecutter.lowercase_modelname}}.rst b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/{{cookiecutter.lowercase_modelname}}.rst index 67384736e738ae..7510fe44e9b78f 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/{{cookiecutter.lowercase_modelname}}.rst +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/{{cookiecutter.lowercase_modelname}}.rst @@ -46,8 +46,7 @@ Tips: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}TokenizerFast - :members: build_inputs_with_special_tokens, get_special_tokens_mask, - create_token_type_ids_from_sequences, save_vocabulary + :members: {% if "PyTorch" in cookiecutter.generate_tensorflow_and_pytorch -%} diff --git a/examples/tests/deepspeed/ds_config.json b/tests/deepspeed/ds_config_zero2.json similarity index 96% rename from examples/tests/deepspeed/ds_config.json rename to tests/deepspeed/ds_config_zero2.json index 8c961be5518f8d..a516f33125ef61 100644 --- a/examples/tests/deepspeed/ds_config.json +++ b/tests/deepspeed/ds_config_zero2.json @@ -3,7 +3,7 @@ "enabled": true, "loss_scale": 0, "loss_scale_window": 1000, - "initial_scale_power": 32, + "initial_scale_power": 16, "hysteresis": 2, "min_loss_scale": 1 }, diff --git a/tests/deepspeed/ds_config_zero3.json b/tests/deepspeed/ds_config_zero3.json new file mode 100644 index 00000000000000..0f909959521ef8 --- /dev/null +++ b/tests/deepspeed/ds_config_zero3.json @@ -0,0 +1,48 @@ +{ + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + "zero_optimization": { + "stage": 3, + "cpu_offload": true, + "cpu_offload_params": true, + "cpu_offload_use_pin_memory" : true, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e14, + "reduce_bucket_size": 0, + "stage3_prefetch_bucket_size": 0, + "stage3_param_persistence_threshold": 0, + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_fp16_weights_on_model_save": true + }, + + "optimizer": { + "type": "AdamW", + "params": { + "lr": 3e-5, + "betas": [0.8, 0.999], + "eps": 1e-8, + "weight_decay": 3e-7 + } + }, + + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0, + "warmup_max_lr": 3e-5, + "warmup_num_steps": 500 + } + }, + + "steps_per_print": 2000, + "wall_clock_breakdown": false +} diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py new file mode 100644 index 00000000000000..9baaf3085b86a2 --- /dev/null +++ b/tests/deepspeed/test_deepspeed.py @@ -0,0 +1,637 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
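The new `ds_config_zero3.json` above is consumed through the `deepspeed` training argument, which the tests in this patch exercise both as a file path and as an already-loaded dict. A rough sketch of that wiring (the output directory and config path are illustrative):

    import json
    from transformers import TrainingArguments

    # either point at the config file directly ...
    args = TrainingArguments(output_dir="out", deepspeed="tests/deepspeed/ds_config_zero3.json")

    # ... or load it and tweak it programmatically before handing it to the Trainer
    with open("tests/deepspeed/ds_config_zero3.json") as f:
        ds_config = json.load(f)
    ds_config["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = False
    args = TrainingArguments(output_dir="out", deepspeed=ds_config)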
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import dataclasses +import io +import json +import os +import unittest +from copy import deepcopy + +from parameterized import parameterized +from transformers import TrainingArguments, is_torch_available +from transformers.file_utils import WEIGHTS_NAME +from transformers.integrations import is_deepspeed_available +from transformers.testing_utils import ( + CaptureLogger, + ExtendSysPath, + TestCasePlus, + execute_subprocess_async, + get_gpu_count, + mockenv_context, + require_torch_gpu, + require_torch_multi_gpu, + slow, +) +from transformers.trainer_utils import set_seed + + +bindir = os.path.abspath(os.path.dirname(__file__)) +with ExtendSysPath(f"{bindir}/.."): + from test_trainer import TrainerIntegrationCommon # noqa + + if is_torch_available(): + from test_trainer import get_regression_trainer # noqa + + +set_seed(42) +MBART_TINY = "sshleifer/tiny-mbart" +T5_SMALL = "t5-small" + + +def load_json(path): + with open(path) as f: + return json.load(f) + + +# a candidate for testing_utils +def require_deepspeed(test_case): + """ + Decorator marking a test that requires deepspeed + """ + if not is_deepspeed_available(): + return unittest.skip("test requires deepspeed")(test_case) + else: + return test_case + + +ZERO2 = "zero2" +ZERO3 = "zero3" +stages = [ZERO2, ZERO3] + + +@require_deepspeed +@require_torch_gpu +class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon): + """ + + This class is for testing directly via get_regression_trainer + + It mixes in `TrainerIntegrationCommon` which already has a lot of helper validation methods + which we can re-use here. + + Important: this class' setup can only work with a single gpu because it runs within the current + pytest worker. For multi-gpu tests use TestDeepSpeedWithLauncher. + + Note: if any of the tests of this class get run there will be at least one gpu occupied by them + until this pytest worker exits. This is because the gpu memory allocated by the cuda-kernels + won't be released until this pytest worker exits. + + This may appear as some run-away tests if you watch `nvidia-smi` while other tests that fork new + processes are run. So there will be one or two "stale" processes reported in `nvidia-smi`. This + is not a bug. 
+ """ + + def setUp(self): + super().setUp() + + args = TrainingArguments(".") + self.n_epochs = args.num_train_epochs + self.batch_size = args.train_batch_size + + self.dist_env_1_gpu = dict( + MASTER_ADDR="localhost", MASTER_PORT="10999", RANK="0", LOCAL_RANK="0", WORLD_SIZE="1" + ) + + self.ds_config_file = {} + self.ds_config_file[ZERO2] = f"{self.test_file_dir_str}/ds_config_zero2.json" + self.ds_config_file[ZERO3] = f"{self.test_file_dir_str}/ds_config_zero3.json" + + # use self.get_config_dict(stage) to use these to ensure the original is not modified + self.ds_config_dict = {} + with io.open(self.ds_config_file[ZERO2], "r", encoding="utf-8") as f: + self.ds_config_dict[ZERO2] = json.load(f) + with io.open(self.ds_config_file[ZERO3], "r", encoding="utf-8") as f: + self.ds_config_dict[ZERO3] = json.load(f) + + def get_config_dict(self, stage): + """ As the tests modify the dict, always make a copy """ + config = deepcopy(self.ds_config_dict[stage]) + if stage == ZERO3: + # This setting slows things down, so don't enable it by default unless needed by a test. + # It's in the file as a demo for users since we want everything to work out of the box even if slower. + config["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = False + return config + + # --- These tests are enough to run on one of zero stages --- # + + # Test various combos + # 1. DS scheduler + DS optimizer: this is already tested by most other tests + # 2. HF scheduler + HF optimizer: + # 3. DS scheduler + HF optimizer: + # 4. HF scheduler + DS optimizer: + + def test_hf_scheduler_hf_optimizer(self): + a = 0 + with mockenv_context(**self.dist_env_1_gpu): + ds_config_zero2_dict = self.get_config_dict(ZERO2) + del ds_config_zero2_dict["optimizer"] # force default HF Trainer optimizer + del ds_config_zero2_dict["scheduler"] # force default HF Trainer scheduler + ds_config_zero2_dict["zero_optimization"]["cpu_offload"] = False + ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step + trainer = get_regression_trainer(a=a, local_rank=0, deepspeed=ds_config_zero2_dict) + trainer.train() + new_a = trainer.model.a.item() + self.assertNotEqual(new_a, a) + + def test_ds_scheduler_hf_optimizer(self): + a = 0 + with mockenv_context(**self.dist_env_1_gpu): + ds_config_zero2_dict = self.get_config_dict(ZERO2) + del ds_config_zero2_dict["optimizer"] # force default HF Trainer optimizer + ds_config_zero2_dict["zero_optimization"]["cpu_offload"] = False + ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step + trainer = get_regression_trainer(a=a, local_rank=0, deepspeed=ds_config_zero2_dict) + trainer.train() + new_a = trainer.model.a.item() + self.assertNotEqual(new_a, a) + + def test_hf_scheduler_ds_optimizer(self): + # this combo is not possible at the moment + with mockenv_context(**self.dist_env_1_gpu): + ds_config_zero2_dict = self.get_config_dict(ZERO2) + del ds_config_zero2_dict["scheduler"] # force default HF Trainer scheduler + ds_config_zero2_dict["zero_optimization"]["cpu_offload"] = False + ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step + trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_zero2_dict) + with self.assertRaises(Exception) as context: + trainer.train() + self.assertTrue("HF scheduler + DeepSpeed optimizer combination is not possible" in str(context.exception)) + + def test_hf_optimizer_with_offload(self): + # must not allow non-DS optimizer when using 
ZERO-offload + with mockenv_context(**self.dist_env_1_gpu): + ds_config_zero2_dict = self.get_config_dict(ZERO2) + del ds_config_zero2_dict["optimizer"] # force default HF Trainer optimizer + ds_config_zero2_dict["zero_optimization"]["cpu_offload"] = True + # sanity check - should the default config change + assert ( + "cpu_offload" in ds_config_zero2_dict["zero_optimization"] + and ds_config_zero2_dict["zero_optimization"]["cpu_offload"] is True + ), "ensure the config is set up correctly" + trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_zero2_dict) + with self.assertRaises(Exception) as context: + trainer.train() + self.assertTrue("ZeRO Offload can only work with DeepSpeed optimizers" in str(context.exception)) + + # --- These tests need to run on both zero stages --- # + @parameterized.expand(stages) + def test_fake_notebook_no_launcher(self, stage): + # this setup emulates a notebook where a launcher needs to be emulated by hand + + # note that unittest resets sys.stdout each test, so `CaptureStd` will work here to capture + # DeepSpeed log if this test happens to run first in this pytest worker. But it will fail if + # it's run not as a first test as `sys.stdout` will no longer be the same. So we either have + # to reset `logger.handlers[0].setStream(sys.stdout)` or directly capture from the logger. + from deepspeed.utils import logger + + with CaptureLogger(logger) as cs: + with mockenv_context(**self.dist_env_1_gpu): + trainer = get_regression_trainer(local_rank=0, deepspeed=self.ds_config_file[stage]) + trainer.train() + assert "DeepSpeed info" in cs.out, "expected DeepSpeed logger output but got none" + + @parameterized.expand(stages) + def test_early_get_last_lr(self, stage): + # with deepspeed's fp16 and dynamic loss scale enabled the optimizer/scheduler steps may + # not run for the first few dozen steps while loss scale is too large, and thus during + # that time `get_last_lr` will fail if called during that warm up stage, + # + # setting `logging_steps=1` forces an early `trainer._maybe_log_save_evaluate()` which calls + # `self.lr_scheduler.get_last_lr()` and originally it'd fail on the very first step. + with mockenv_context(**self.dist_env_1_gpu): + a = b = 0.0 + trainer = get_regression_trainer( + a=a, + b=b, + local_rank=0, + train_len=8, + deepspeed=self.ds_config_file[stage], + per_device_train_batch_size=8, + logging_steps=1, + ) + trainer.train() + post_train_a = trainer.model.a.item() + + # XXX: for some reason the following check fails with zero3 - not a broken but a + # different qualitative outcome - need to investigate at some point + if stage == ZERO3: + return + + # it's enough that train didn't fail for this test, but we must check that + # optimizer/scheduler didn't run (since if it did this test isn't testing the right thing) + self.assertEqual(post_train_a, a) + + @parameterized.expand(stages) + def test_gradient_accumulation(self, stage): + # this test measures that we get identical weights and similar loss with: + # 1. per_device_train_batch_size=8, gradient_accumulation_steps=1 + # 2. 
per_device_train_batch_size=4, gradient_accumulation_steps=2 + # since the 2nd should produce the effective batch of 1st, with the same results + # + # I can get an identical loss for a small train_len=32, plus the power of the initial + # dynamic loss scale value set to: + # "fp16.initial_scale_power": 1 + # plus having the same WarmupLR's warmup_min_lr == warmup_max_lr in the config file + # but for some reason going to train_len=64 the weights, weights start to mismatch with this setup. + # the culprit seems to be `initial_scale_power` - putting it back to its default 32 keeps the weights identical + + train_len = 64 + a = b = 0.0 + + with mockenv_context(**self.dist_env_1_gpu): + no_grad_accum_trainer = get_regression_trainer( + a=a, + b=b, + local_rank=0, + train_len=train_len, + deepspeed=self.ds_config_file[stage], + per_device_train_batch_size=8, + gradient_accumulation_steps=1, + ) + no_grad_accum_result = no_grad_accum_trainer.train() + no_grad_accum_loss = no_grad_accum_result.training_loss + no_grad_accum_a = no_grad_accum_trainer.model.a.item() + no_grad_accum_b = no_grad_accum_trainer.model.b.item() + # make sure the optimizer kicked in - if it hasn't changed from the original value of a then make train_len bigger + self.assertNotEqual(no_grad_accum_a, a) + + with mockenv_context(**self.dist_env_1_gpu): + yes_grad_accum_trainer = get_regression_trainer( + a=a, + b=b, + local_rank=0, + train_len=train_len, + deepspeed=self.ds_config_file[stage], + per_device_train_batch_size=4, + gradient_accumulation_steps=2, + ) + yes_grad_accum_result = yes_grad_accum_trainer.train() + yes_grad_accum_loss = yes_grad_accum_result.training_loss + yes_grad_accum_a = yes_grad_accum_trainer.model.a.item() + yes_grad_accum_b = yes_grad_accum_trainer.model.b.item() + self.assertNotEqual(yes_grad_accum_a, a) + + # training with half the batch size but accumulation steps as 2 should give the same weights + self.assertEqual(no_grad_accum_a, yes_grad_accum_a) + self.assertEqual(no_grad_accum_b, yes_grad_accum_b) + + # see the note above how to get identical loss on a small bs + self.assertAlmostEqual(no_grad_accum_loss, yes_grad_accum_loss, places=5) + + def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, stage): + # adapted from TrainerIntegrationCommon.check_saved_checkpoints + + file_list = [WEIGHTS_NAME, "training_args.bin", "trainer_state.json", "config.json"] + + if stage == ZERO2: + ds_file_list = ["mp_rank_00_model_states.pt"] + elif stage == ZERO3: + ds_file_list = ["zero_pp_rank_0_mp_rank_00_model_states.pt"] + else: + raise ValueError(f"unknown stage {stage}") + + # XXX: this can be recoded and then removed once we require deepspeed>0.3.13 + from packaging import version + + import deepspeed + + if version.parse(deepspeed.__version__) > version.parse("0.3.13"): + ds_file_list.append("zero_pp_rank_0_mp_rank_00_optim_states.pt") + else: + ds_file_list.append("zero_pp_rank_0_mp_rank_00optim_states.pt") + + for step in range(freq, total, freq): + checkpoint = os.path.join(output_dir, f"checkpoint-{step}") + self.assertTrue(os.path.isdir(checkpoint), f"[{stage}] {checkpoint} dir is not found") + + # common files + for filename in file_list: + path = os.path.join(checkpoint, filename) + self.assertTrue(os.path.isfile(path), f"[{stage}] {path} is not found") + + # ds files + ds_path = os.path.join(checkpoint, f"global_step{step}") + for filename in ds_file_list: + # filename = os.path.join(path, filename) + # print(filename) + path = os.path.join(ds_path, filename) + 
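Two details in `test_gradient_accumulation` are worth spelling out: the two runs are comparable only because their effective batch sizes match, and `initial_scale_power` sets the starting fp16 dynamic loss scale (2 ** power), which decides whether the optimizer already steps on the very first batch. A small illustrative check:

    # effective batch size = per-device batch size * accumulation steps (single GPU here)
    assert 8 * 1 == 4 * 2 == 8          # run 1 vs run 2 -> same effective batch, same expected updates

    # fp16 dynamic loss scaling starts at 2 ** initial_scale_power
    assert 2 ** 1 == 2                  # value forced in these tests: optimizer steps immediately
    assert 2 ** 16 == 65536             # the new ds_config default: early steps may be skipped on overflow
    assert 2 ** 32 == 4294967296        # the old ds_config value, even more conservative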
self.assertTrue(os.path.isfile(path), f"[{stage}] {path} is not found") + + @parameterized.expand(stages) + def test_save_checkpoints(self, stage): + # adapted from TrainerIntegrationTest.test_save_checkpoints + + freq = 5 + output_dir = self.get_auto_remove_tmp_dir() + ds_config_dict = self.get_config_dict(stage) + ds_config_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step + if stage == ZERO3: + ds_config_dict["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = True + + # save checkpoints + with mockenv_context(**self.dist_env_1_gpu): + trainer = get_regression_trainer( + output_dir=output_dir, + save_steps=freq, + deepspeed=ds_config_dict, + ) + trainer.train() + + total = int(self.n_epochs * 64 / self.batch_size) + self.check_saved_checkpoints_deepspeed(output_dir, freq, total, stage) + + @parameterized.expand(stages) + def test_can_resume_training_errors(self, stage): + + with mockenv_context(**self.dist_env_1_gpu): + ds_config_dict = self.get_config_dict(stage) + output_dir = self.get_auto_remove_tmp_dir() + trainer = get_regression_trainer(output_dir=output_dir, deepspeed=ds_config_dict) + + # 1. fail to find any checkpoint - due a fresh output_dir + with self.assertRaises(Exception) as context: + trainer.train(resume_from_checkpoint=True) + self.assertTrue( + "No valid checkpoint found in output directory" in str(context.exception), + f"got exception: {context.exception}", + ) + + # 2. fail to find a bogus checkpoint + with self.assertRaises(Exception) as context: + checkpoint = os.path.join(output_dir, "checkpoint-5") + trainer.train(resume_from_checkpoint=f"{checkpoint}-bogus") + self.assertTrue( + "Can't find a valid checkpoint at" in str(context.exception), f"got exception: {context.exception}" + ) + + @parameterized.expand(stages) + def test_can_resume_training_normal(self, stage): + # adapted from TrainerIntegrationTest.test_can_resume_training + # test normal resume for each stage separately, error-handling is tested in a different test + output_dir = self.get_auto_remove_tmp_dir() + ds_config_dict = self.get_config_dict(stage) + ds_config_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step + if stage == ZERO3: + ds_config_dict["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = True + + kwargs = dict(output_dir=output_dir, train_len=128, save_steps=5, learning_rate=0.1, deepspeed=ds_config_dict) + + with mockenv_context(**self.dist_env_1_gpu): + trainer = get_regression_trainer(**kwargs) + trainer.train() + (a, b) = trainer.model.a.item(), trainer.model.b.item() + state = dataclasses.asdict(trainer.state) + + checkpoint = os.path.join(output_dir, "checkpoint-5") + + # Reinitialize trainer + trainer = get_regression_trainer(**kwargs) + + trainer.train(resume_from_checkpoint=checkpoint) + (a1, b1) = trainer.model.a.item(), trainer.model.b.item() + state1 = dataclasses.asdict(trainer.state) + self.assertEqual(a, a1) + self.assertEqual(b, b1) + self.check_trainer_state_are_the_same(state, state1) + + # Now check with a later checkpoint that it also works when we span over one epoch + checkpoint = os.path.join(output_dir, "checkpoint-15") + + # Reinitialize trainer and load model + trainer = get_regression_trainer(**kwargs) + + trainer.train(resume_from_checkpoint=checkpoint) + (a1, b1) = trainer.model.a.item(), trainer.model.b.item() + state1 = dataclasses.asdict(trainer.state) + self.assertEqual(a, a1) + self.assertEqual(b, b1) + self.check_trainer_state_are_the_same(state, state1) + + +@slow 
+@require_deepspeed +@require_torch_gpu +class TestDeepSpeedWithLauncher(TestCasePlus): + """ This class is for testing via an external script - can do multiple gpus """ + + # Tests to devise # + # + # 1. predict_with_generate on multigpu - need to figure out how to give input sequences so that + # the 2 gpus will generate prediction sequences that aren't of the same length - this is because + # we had to code a special feature to sync the gpus when the predicted sequences aren't of the + # same length. In general this will tested as a side-effect through a variety of other tests - + # it'll simply hang trying to synchronize with other gpus if this problem is encountered. So as + # long as we have a few full tests running on zero3 + predict_with_generate this should be + # mostly covered. + # + # but there are 5 variations on beam search in `generate`- with identical code branched with `if + # synced_gpus` + # + # 2. most tests should probably be run on both: zero2 and zero3 configs + # + + @require_torch_multi_gpu + @parameterized.expand(stages) + def test_basic_distributed(self, stage): + self.run_and_check(stage=stage, distributed=True) + + @parameterized.expand(stages) + def test_do_eval_no_train(self, stage): + # we should not fail if train is skipped + self.run_and_check( + stage=stage, + eval_steps=1, + distributed=False, + do_train=False, + do_eval=True, + ) + + @parameterized.expand(stages) + def test_resume_train_not_from_ds_checkpoint(self, stage): + # do normal training and then resume not from the deepspeed checkpoint but explicitly from + # the saved model dir + + do_train = True + do_eval = False + kwargs = dict(stage=stage, eval_steps=1, distributed=True, do_train=do_train, do_eval=do_eval) + + # 1. normal training + output_dir = self.run_and_check(**kwargs) + + # 2. now resume explicitly from the saved weights, by passing --model_name_or_path output_dir + # - i.e. 
the same path the model was saved to in step 1 + output_dir = self.run_trainer(**kwargs, model_name=output_dir) + + self.do_checks(output_dir, do_train=do_train, do_eval=do_eval) + + def do_checks(self, output_dir, do_train=True, do_eval=True): + + if do_train: + train_metrics = load_json(os.path.join(output_dir, "train_results.json")) + self.assertIn("train_samples_per_second", train_metrics) + self.assertGreater(train_metrics["train_samples_per_second"], 0.5) + + if do_eval: + eval_metrics = load_json(os.path.join(output_dir, "eval_results.json")) + self.assertIn("eval_bleu", eval_metrics) + self.assertGreater(eval_metrics["eval_bleu"], 0) + + # XXX: need to do better validation beyond just that the run was successful + def run_and_check( + self, + stage, + eval_steps=10, + distributed=True, + do_train=True, + do_eval=True, + extra_args_str=None, + remove_args_str=None, + ): + + # we are doing quality testing so using a small real model + output_dir = self.run_trainer( + stage=stage, + model_name=T5_SMALL, + eval_steps=eval_steps, + num_train_epochs=1, + do_train=do_train, + do_eval=do_eval, + distributed=distributed, + extra_args_str=extra_args_str, + remove_args_str=remove_args_str, + ) + + self.do_checks(output_dir, do_train=do_train, do_eval=do_eval) + + return output_dir + + def run_trainer( + self, + stage: str, + model_name: str, + eval_steps: int = 10, + num_train_epochs: int = 1, + do_train: bool = False, + do_eval: bool = True, + distributed: bool = True, + extra_args_str: str = None, + remove_args_str: str = None, + ): + max_len = 32 + data_dir = self.examples_dir / "test_data/wmt_en_ro" + output_dir = self.get_auto_remove_tmp_dir() + args = f""" + --model_name_or_path {model_name} + --train_file {data_dir}/train.json + --validation_file {data_dir}/val.json + --output_dir {output_dir} + --overwrite_output_dir + --max_source_length {max_len} + --max_target_length {max_len} + --val_max_target_length {max_len} + --warmup_steps 8 + --predict_with_generate + --logging_steps 0 + --save_steps 0 + --eval_steps {eval_steps} + --group_by_length + --label_smoothing_factor 0.1 + --adafactor + --source_lang en + --target_lang ro + """.split() + args.extend(["--source_prefix", '"translate English to Romanian: "']) + + actions = 0 + if do_train: + actions += 1 + args.extend( + f""" + --do_train + --num_train_epochs {str(num_train_epochs)} + --max_train_samples 100 + --per_device_train_batch_size 2 + --learning_rate 3e-3 + """.split() + ) + + if do_eval: + actions += 1 + args.extend( + """ + --do_eval + --max_val_samples 100 + --per_device_eval_batch_size 2 + """.split() + ) + + assert actions > 0, "need at least do_train or do_eval for the test to run" + + if extra_args_str is not None: + args.extend(extra_args_str.split()) + + # currently only works for bool args + if remove_args_str is not None: + remove_args = remove_args_str.split() + args = [x for x in args if x not in remove_args] + + ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage}.json".split() + script = [f"{self.examples_dir_str}/seq2seq/run_translation.py"] + num_gpus = get_gpu_count() if distributed else 1 + launcher = f"deepspeed --num_gpus {num_gpus}".split() + + cmd = launcher + script + args + ds_args + # keep for quick debug + # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die + execute_subprocess_async(cmd, env=self.get_env()) + + return output_dir + + @parameterized.expand(stages) + def test_clm(self, stage): + # this test exercises model.resize_token_embeddings() which requires param 
gathering outside + # of forward - it's not used by `run_translation.py`, but it is in `run_clm.py` + + data_dir = self.tests_dir / "fixtures" + output_dir = self.get_auto_remove_tmp_dir() + args = f""" + --model_name_or_path sshleifer/tiny-gpt2 + --train_file {data_dir}/sample_text.txt + --validation_file {data_dir}/sample_text.txt + --output_dir {output_dir} + --overwrite_output_dir + --do_train + --do_eval + --max_train_samples 10 + --max_val_samples 10 + --per_device_train_batch_size 5 + --per_device_eval_batch_size 5 + --num_train_epochs 1 + --warmup_steps 8 + --block_size 128 + """.split() + + distributed = True + ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage}.json".split() + script = [f"{self.examples_dir_str}/language-modeling/run_clm.py"] + num_gpus = get_gpu_count() if distributed else 1 + launcher = f"deepspeed --num_gpus {num_gpus}".split() + + cmd = launcher + script + args + ds_args + # keep for quick debug + # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die + execute_subprocess_async(cmd, env=self.get_env()) + + return output_dir diff --git a/examples/tests/trainer/test_trainer_ext.py b/tests/extended/test_trainer_ext.py similarity index 98% rename from examples/tests/trainer/test_trainer_ext.py rename to tests/extended/test_trainer_ext.py index 82ec2f625cf0b1..6d13f9a4cced97 100644 --- a/examples/tests/trainer/test_trainer_ext.py +++ b/tests/extended/test_trainer_ext.py @@ -21,6 +21,7 @@ from transformers.file_utils import is_apex_available from transformers.integrations import is_fairscale_available from transformers.testing_utils import ( + ExtendSysPath, TestCasePlus, execute_subprocess_async, get_gpu_count, @@ -34,8 +35,8 @@ bindir = os.path.abspath(os.path.dirname(__file__)) -sys.path.append(f"{bindir}/../../seq2seq") -from run_translation import main # noqa +with ExtendSysPath(f"{bindir}/../../examples/seq2seq"): + from run_translation import main # noqa set_seed(42) diff --git a/tests/fixtures/dummy_feature_extractor_config.json b/tests/fixtures/dummy_feature_extractor_config.json new file mode 100644 index 00000000000000..cf0c5dce6c42b8 --- /dev/null +++ b/tests/fixtures/dummy_feature_extractor_config.json @@ -0,0 +1,3 @@ +{ + "feature_extractor_type": "Wav2Vec2FeatureExtractor" +} \ No newline at end of file diff --git a/tests/sagemaker/README.md b/tests/sagemaker/README.md index b3c9906cc5fcad..3d8ab7c2bfe02c 100644 --- a/tests/sagemaker/README.md +++ b/tests/sagemaker/README.md @@ -136,10 +136,7 @@ images: docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] ``` -2. In the PR comment describe what test we ran and with which framework versions. Here you can copy the table from [Current Tests](#current-tests). - -TODO: Add a screenshot of PR + Text template to make it easy to open. - +2. In the PR comment describe what test we ran and with which framework versions. Here you can copy the table from [Current Tests](#current-tests). You can take a look at this [PR](https://github.com/aws/deep-learning-containers/pull/1016), which information are needed. ## Current Tests @@ -150,4 +147,4 @@ TODO: Add a screenshot of PR + Text template to make it easy to open. 
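The "param gathering" referred to above is a ZeRO-3 concern: parameters are sharded across ranks, so anything that touches full weight tensors outside of forward/backward (such as resizing the token embeddings) must gather them first. A hedged sketch of the underlying DeepSpeed idiom, assuming `model` is a ZeRO-3 initialized transformers model (the surrounding engine setup is omitted):

    import deepspeed

    def full_embedding_shape(model):
        """Gather a ZeRO-3 sharded embedding weight just long enough to inspect it (sketch)."""
        weight = model.get_input_embeddings().weight
        # outside this context each rank only sees its own shard of the parameter;
        # GatheredParameters materializes the full tensor and re-partitions it on exit
        with deepspeed.zero.GatheredParameters(weight, modifier_rank=None):
            return tuple(weight.shape)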
| pytorch-transfromers-test-2-smd | test bert finetuning using BERT from transformer lib+ PT SM DDP | SageMaker createTrainingJob | 16 | train_runtime, eval_accuracy & eval_loss | | pytorch-transfromers-test-1-smp | test roberta finetuning using BERT from transformer lib+ PT SM MP | SageMaker createTrainingJob | 8 | train_runtime, eval_accuracy & eval_loss | | tensorflow-transfromers-test-single | Test bert finetuning using BERT from transformer lib+TF | SageMaker createTrainingJob | 1 | train_runtime, eval_accuracy & eval_loss | -| tensorflow-transfromers-test-2-smd | test bert finetuning using BERT from transformer lib+ TF SM DDP | SageMaker createTrainingJob | 16 | train_runtime, eval_accuracy & eval_loss | \ No newline at end of file +| tensorflow-transfromers-test-2-smd | test bert finetuning using BERT from transformer lib+ TF SM DDP | SageMaker createTrainingJob | 16 | train_runtime, eval_accuracy & eval_loss | diff --git a/tests/sagemaker/scripts/pytorch/run_glue_model_parallelism.py b/tests/sagemaker/scripts/pytorch/run_glue_model_parallelism.py index 1bc9ed4ce82d15..1476a687a90a38 100644 --- a/tests/sagemaker/scripts/pytorch/run_glue_model_parallelism.py +++ b/tests/sagemaker/scripts/pytorch/run_glue_model_parallelism.py @@ -353,7 +353,7 @@ def main(): if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)} else: - logger.warn( + logger.warning( "Your model seems to have been trained with labels, but they don't match the dataset: ", f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}." "\nIgnoring the model labels as a result.", @@ -362,7 +362,7 @@ def main(): label_to_id = {v: i for i, v in enumerate(label_list)} if data_args.max_seq_length > tokenizer.model_max_length: - logger.warn( + logger.warning( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." 
) diff --git a/tests/sagemaker/test_multi_node_data_parallel.py b/tests/sagemaker/test_multi_node_data_parallel.py index 67d8dcd70d3766..0a826f4b15a769 100644 --- a/tests/sagemaker/test_multi_node_data_parallel.py +++ b/tests/sagemaker/test_multi_node_data_parallel.py @@ -1,3 +1,4 @@ +import json import os import subprocess import unittest @@ -11,7 +12,7 @@ if is_sagemaker_available(): - from sagemaker import TrainingJobAnalytics + from sagemaker import Session, TrainingJobAnalytics from sagemaker.huggingface import HuggingFace @@ -27,21 +28,21 @@ "script": "run_glue.py", "model_name_or_path": "distilbert-base-cased", "instance_type": "ml.p3dn.24xlarge", - "results": {"train_runtime": 300, "eval_accuracy": 0.7, "eval_loss": 0.6}, + "results": {"train_runtime": 650, "eval_accuracy": 0.7, "eval_loss": 0.6}, }, { "framework": "pytorch", "script": "run_ddp.py", "model_name_or_path": "distilbert-base-cased", "instance_type": "ml.p3dn.24xlarge", - "results": {"train_runtime": 300, "eval_accuracy": 0.7, "eval_loss": 0.6}, + "results": {"train_runtime": 600, "eval_accuracy": 0.7, "eval_loss": 0.6}, }, { "framework": "tensorflow", "script": "run_tf_dist.py", "model_name_or_path": "distilbert-base-cased", "instance_type": "ml.p3dn.24xlarge", - "results": {"train_runtime": 500, "eval_accuracy": 0.6, "eval_loss": 0.7}, + "results": {"train_runtime": 600, "eval_accuracy": 0.6, "eval_loss": 0.7}, }, ] ) @@ -88,17 +89,22 @@ def test_script(self, instance_count): # run training estimator.fit() - # save csv - self.save_results_as_csv(estimator.latest_training_job.name) # result dataframe result_metrics_df = TrainingJobAnalytics(estimator.latest_training_job.name).dataframe() # extract kpis - train_runtime = list(result_metrics_df[result_metrics_df.metric_name == "train_runtime"]["value"]) eval_accuracy = list(result_metrics_df[result_metrics_df.metric_name == "eval_accuracy"]["value"]) eval_loss = list(result_metrics_df[result_metrics_df.metric_name == "eval_loss"]["value"]) + # get train time from SageMaker job, this includes starting, preprocessing, stopping + train_runtime = ( + Session().describe_training_job(estimator.latest_training_job.name).get("TrainingTimeInSeconds", 999999) + ) # assert kpis - assert all(t <= self.results["train_runtime"] for t in train_runtime) - assert any(t >= self.results["eval_accuracy"] for t in eval_accuracy) + assert train_runtime <= self.results["train_runtime"] + assert all(t >= self.results["eval_accuracy"] for t in eval_accuracy) assert all(t <= self.results["eval_loss"] for t in eval_loss) + + # dump tests result into json file to share in PR + with open(f"{estimator.latest_training_job.name}.json", "w") as outfile: + json.dump({"train_time": train_runtime, "eval_accuracy": eval_accuracy, "eval_loss": eval_loss}, outfile) diff --git a/tests/sagemaker/test_multi_node_model_parallel.py b/tests/sagemaker/test_multi_node_model_parallel.py index bca402bcba42f0..a59c207fb0edf9 100644 --- a/tests/sagemaker/test_multi_node_model_parallel.py +++ b/tests/sagemaker/test_multi_node_model_parallel.py @@ -1,4 +1,6 @@ +import json import os +import subprocess import unittest from ast import literal_eval @@ -10,7 +12,7 @@ if is_sagemaker_available(): - from sagemaker import TrainingJobAnalytics + from sagemaker import Session, TrainingJobAnalytics from sagemaker.huggingface import HuggingFace @@ -26,12 +28,25 @@ "script": "run_glue_model_parallelism.py", "model_name_or_path": "roberta-large", "instance_type": "ml.p3dn.24xlarge", - "results": {"train_runtime": 700, "eval_accuracy": 
0.3, "eval_loss": 1.2}, + "results": {"train_runtime": 1500, "eval_accuracy": 0.3, "eval_loss": 1.2}, + }, + { + "framework": "pytorch", + "script": "run_glue.py", + "model_name_or_path": "roberta-large", + "instance_type": "ml.p3dn.24xlarge", + "results": {"train_runtime": 1500, "eval_accuracy": 0.3, "eval_loss": 1.2}, }, ] ) class MultiNodeTest(unittest.TestCase): def setUp(self): + if self.framework == "pytorch": + subprocess.run( + f"cp ./examples/text-classification/run_glue.py {self.env.test_path}/run_glue.py".split(), + encoding="utf-8", + check=True, + ) assert hasattr(self, "env") def create_estimator(self, instance_count): @@ -55,13 +70,14 @@ def create_estimator(self, instance_count): distribution = {"smdistributed": {"modelparallel": smp_options}, "mpi": mpi_options} + name_extension = "trainer" if self.script == "run_glue.py" else "smtrainer" # creates estimator return HuggingFace( entry_point=self.script, source_dir=self.env.test_path, role=self.env.role, image_uri=self.env.image_uri, - base_job_name=f"{self.env.base_job_name}-{instance_count}-smp", + base_job_name=f"{self.env.base_job_name}-{instance_count}-smp-{name_extension}", instance_count=instance_count, instance_type=self.instance_type, debugger_hook_config=False, @@ -87,17 +103,22 @@ def test_scripz(self, instance_count): # run training estimator.fit() - # save csv - self.save_results_as_csv(estimator.latest_training_job.name) # result dataframe result_metrics_df = TrainingJobAnalytics(estimator.latest_training_job.name).dataframe() # extract kpis - train_runtime = list(result_metrics_df[result_metrics_df.metric_name == "train_runtime"]["value"]) eval_accuracy = list(result_metrics_df[result_metrics_df.metric_name == "eval_accuracy"]["value"]) eval_loss = list(result_metrics_df[result_metrics_df.metric_name == "eval_loss"]["value"]) + # get train time from SageMaker job, this includes starting, preprocessing, stopping + train_runtime = ( + Session().describe_training_job(estimator.latest_training_job.name).get("TrainingTimeInSeconds", 999999) + ) # assert kpis - assert all(t <= self.results["train_runtime"] for t in train_runtime) + assert train_runtime <= self.results["train_runtime"] assert all(t >= self.results["eval_accuracy"] for t in eval_accuracy) assert all(t <= self.results["eval_loss"] for t in eval_loss) + + # dump tests result into json file to share in PR + with open(f"{estimator.latest_training_job.name}.json", "w") as outfile: + json.dump({"train_time": train_runtime, "eval_accuracy": eval_accuracy, "eval_loss": eval_loss}, outfile) diff --git a/tests/sagemaker/test_single_node_gpu.py b/tests/sagemaker/test_single_node_gpu.py index aa08bd06419a85..71bf9d0928abd6 100644 --- a/tests/sagemaker/test_single_node_gpu.py +++ b/tests/sagemaker/test_single_node_gpu.py @@ -1,3 +1,4 @@ +import json import os import subprocess import unittest @@ -11,7 +12,7 @@ if is_sagemaker_available(): - from sagemaker import TrainingJobAnalytics + from sagemaker import Session, TrainingJobAnalytics from sagemaker.huggingface import HuggingFace @@ -27,14 +28,14 @@ "script": "run_glue.py", "model_name_or_path": "distilbert-base-cased", "instance_type": "ml.g4dn.xlarge", - "results": {"train_runtime": 200, "eval_accuracy": 0.6, "eval_loss": 0.9}, + "results": {"train_runtime": 650, "eval_accuracy": 0.6, "eval_loss": 0.9}, }, { "framework": "tensorflow", "script": "run_tf.py", "model_name_or_path": "distilbert-base-cased", "instance_type": "ml.g4dn.xlarge", - "results": {"train_runtime": 350, "eval_accuracy": 0.3, "eval_loss": 
0.9}, + "results": {"train_runtime": 600, "eval_accuracy": 0.3, "eval_loss": 0.9}, }, ] ) @@ -74,17 +75,22 @@ def test_glue(self): # run training estimator.fit() - # save csv - self.save_results_as_csv(estimator.latest_training_job.name) # result dataframe result_metrics_df = TrainingJobAnalytics(estimator.latest_training_job.name).dataframe() # extract kpis - train_runtime = list(result_metrics_df[result_metrics_df.metric_name == "train_runtime"]["value"]) eval_accuracy = list(result_metrics_df[result_metrics_df.metric_name == "eval_accuracy"]["value"]) eval_loss = list(result_metrics_df[result_metrics_df.metric_name == "eval_loss"]["value"]) + # get train time from SageMaker job, this includes starting, preprocessing, stopping + train_runtime = ( + Session().describe_training_job(estimator.latest_training_job.name).get("TrainingTimeInSeconds", 999999) + ) # assert kpis - assert all(t <= self.results["train_runtime"] for t in train_runtime) + assert train_runtime <= self.results["train_runtime"] assert all(t >= self.results["eval_accuracy"] for t in eval_accuracy) assert all(t <= self.results["eval_loss"] for t in eval_loss) + + # dump tests result into json file to share in PR + with open(f"{estimator.latest_training_job.name}.json", "w") as outfile: + json.dump({"train_time": train_runtime, "eval_accuracy": eval_accuracy, "eval_loss": eval_loss}, outfile) diff --git a/tests/test_data_collator.py b/tests/test_data_collator.py index be138314d330bb..e9d363229f6e03 100644 --- a/tests/test_data_collator.py +++ b/tests/test_data_collator.py @@ -146,11 +146,8 @@ def test_data_collator_for_token_classification(self): self.assertEqual(batch["labels"].shape, torch.Size([2, 6])) self.assertEqual(batch["labels"][0].tolist(), [0, 1, 2] + [-1] * 3) - def test_data_collator_for_language_modeling(self): + def _test_no_pad_and_pad(self, no_pad_features, pad_features): tokenizer = BertTokenizer(self.vocab_file) - no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}] - pad_features = [{"input_ids": list(range(5))}, {"input_ids": list(range(10))}] - data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False) batch = data_collator(no_pad_features) self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10))) @@ -160,6 +157,15 @@ def test_data_collator_for_language_modeling(self): self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10))) self.assertEqual(batch["labels"].shape, torch.Size((2, 10))) + data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, pad_to_multiple_of=8) + batch = data_collator(no_pad_features) + self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16))) + self.assertEqual(batch["labels"].shape, torch.Size((2, 16))) + + batch = data_collator(pad_features) + self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16))) + self.assertEqual(batch["labels"].shape, torch.Size((2, 16))) + tokenizer._pad_token = None data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False) with self.assertRaises(ValueError): @@ -185,6 +191,32 @@ def test_data_collator_for_language_modeling(self): self.assertTrue(torch.any(masked_tokens)) self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist())) + data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8) + batch = data_collator(no_pad_features) + self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16))) + self.assertEqual(batch["labels"].shape, torch.Size((2, 16))) + + masked_tokens = batch["input_ids"] == tokenizer.mask_token_id 
+ self.assertTrue(torch.any(masked_tokens)) + self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist())) + + batch = data_collator(pad_features) + self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16))) + self.assertEqual(batch["labels"].shape, torch.Size((2, 16))) + + masked_tokens = batch["input_ids"] == tokenizer.mask_token_id + self.assertTrue(torch.any(masked_tokens)) + self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist())) + + def test_data_collator_for_language_modeling(self): + no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}] + pad_features = [{"input_ids": list(range(5))}, {"input_ids": list(range(10))}] + self._test_no_pad_and_pad(no_pad_features, pad_features) + + no_pad_features = [list(range(10)), list(range(10))] + pad_features = [list(range(5)), list(range(10))] + self._test_no_pad_and_pad(no_pad_features, pad_features) + def test_plm(self): tokenizer = BertTokenizer(self.vocab_file) no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}] @@ -225,6 +257,14 @@ def test_nsp(self): self.assertEqual(batch["labels"].shape, torch.Size((2, 5))) self.assertEqual(batch["next_sentence_label"].shape, torch.Size((2,))) + data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8) + batch = data_collator(features) + + self.assertEqual(batch["input_ids"].shape, torch.Size((2, 8))) + self.assertEqual(batch["token_type_ids"].shape, torch.Size((2, 8))) + self.assertEqual(batch["labels"].shape, torch.Size((2, 8))) + self.assertEqual(batch["next_sentence_label"].shape, torch.Size((2,))) + def test_sop(self): tokenizer = BertTokenizer(self.vocab_file) features = [ @@ -242,3 +282,11 @@ def test_sop(self): self.assertEqual(batch["token_type_ids"].shape, torch.Size((2, 5))) self.assertEqual(batch["labels"].shape, torch.Size((2, 5))) self.assertEqual(batch["sentence_order_label"].shape, torch.Size((2,))) + + data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8) + batch = data_collator(features) + + self.assertEqual(batch["input_ids"].shape, torch.Size((2, 8))) + self.assertEqual(batch["token_type_ids"].shape, torch.Size((2, 8))) + self.assertEqual(batch["labels"].shape, torch.Size((2, 8))) + self.assertEqual(batch["sentence_order_label"].shape, torch.Size((2,))) diff --git a/tests/test_feature_extraction_auto.py b/tests/test_feature_extraction_auto.py new file mode 100644 index 00000000000000..71ee32c230af38 --- /dev/null +++ b/tests/test_feature_extraction_auto.py @@ -0,0 +1,44 @@ +# coding=utf-8 +# Copyright 2021 the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
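The new assertions above exercise `pad_to_multiple_of`, which rounds each batch's sequence length up to the nearest multiple (useful for tensor-core alignment). A small usage sketch matching the shapes the tests expect (the tokenizer checkpoint is just an example):

    from transformers import BertTokenizer, DataCollatorForLanguageModeling

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, pad_to_multiple_of=8)

    features = [{"input_ids": list(range(5))}, {"input_ids": list(range(10))}]
    batch = collator(features)
    print(batch["input_ids"].shape)  # torch.Size([2, 16]): longest sequence (10) padded up to a multiple of 8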
+ +import os +import unittest + +from transformers import FEATURE_EXTRACTOR_MAPPING, AutoFeatureExtractor, Wav2Vec2FeatureExtractor + + +SAMPLE_FEATURE_EXTRACTION_CONFIG = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "fixtures/dummy_feature_extractor_config.json" +) + + +class AutoFeatureExtractorTest(unittest.TestCase): + def test_feature_extractor_from_model_shortcut(self): + config = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h") + self.assertIsInstance(config, Wav2Vec2FeatureExtractor) + + def test_feature_extractor_from_local_file(self): + config = AutoFeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG) + self.assertIsInstance(config, Wav2Vec2FeatureExtractor) + + def test_pattern_matching_fallback(self): + """ + In cases where config.json doesn't include a model_type, + perform a few safety checks on the config mapping's order. + """ + # no key string should be included in a later key string (typical failure case) + keys = list(FEATURE_EXTRACTOR_MAPPING.keys()) + for i, key in enumerate(keys): + self.assertFalse(any(key in later_key for later_key in keys[i + 1 :])) diff --git a/tests/test_feature_extraction_speech_to_text.py b/tests/test_feature_extraction_speech_to_text.py index 5cd2f67f457d5f..c90beef01377dc 100644 --- a/tests/test_feature_extraction_speech_to_text.py +++ b/tests/test_feature_extraction_speech_to_text.py @@ -20,12 +20,15 @@ import numpy as np -from transformers import Speech2TextFeatureExtractor +from transformers import is_speech_available from transformers.testing_utils import require_torch, require_torchaudio from .test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin +if is_speech_available(): + from transformers import Speech2TextFeatureExtractor + global_rng = random.Random() @@ -101,7 +104,7 @@ def _flatten(list_of_lists): @require_torchaudio class Speech2TextFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): - feature_extraction_class = Speech2TextFeatureExtractor + feature_extraction_class = Speech2TextFeatureExtractor if is_speech_available() else None def setUp(self): self.feat_extract_tester = Speech2TextFeatureExtractionTester(self) diff --git a/tests/test_logging.py b/tests/test_logging.py index f85fe260ca0e06..d0633bfbe41717 100644 --- a/tests/test_logging.py +++ b/tests/test_logging.py @@ -51,7 +51,7 @@ def test_integration(self): # should be able to log warnings (if default settings weren't overridden by `pytest --log-level-all`) if level_origin <= logging.WARNING: with CaptureLogger(logger) as cl: - logger.warn(msg) + logger.warning(msg) self.assertEqual(cl.out, msg + "\n") # this is setting the level for all of `transformers.*` loggers @@ -59,7 +59,7 @@ def test_integration(self): # should not be able to log warnings with CaptureLogger(logger) as cl: - logger.warn(msg) + logger.warning(msg) self.assertEqual(cl.out, "") # should be able to log warnings again diff --git a/tests/test_modeling_albert.py b/tests/test_modeling_albert.py index 1859f51aa5c33d..7f82c67ba088ac 100644 --- a/tests/test_modeling_albert.py +++ b/tests/test_modeling_albert.py @@ -17,6 +17,7 @@ import unittest from transformers import is_torch_available +from transformers.models.auto import get_values from transformers.testing_utils import require_torch, slow, torch_device from .test_configuration_common import ConfigTester @@ -234,7 +235,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = super()._prepare_for_class(inputs_dict, 
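The `test_pattern_matching_fallback` check above guards the ordering of `FEATURE_EXTRACTOR_MAPPING`: when a checkpoint's config carries no `model_type`, the fallback (roughly) picks the first mapping key that appears in the checkpoint name, so no key may be a substring of a later key. An illustrative sketch of the failure mode it prevents (keys and repo name are hypothetical):

    # why the ordering check matters for the pattern-matching fallback
    keys = ["roberta", "bert"]                      # more specific names must come first
    checkpoint = "some-org/roberta-base-custom"     # hypothetical repo whose config lacks model_type
    match = next(k for k in keys if k in checkpoint)
    assert match == "roberta"
    # with the reversed order ["bert", "roberta"], the same checkpoint would match "bert" first,
    # because "bert" is a substring of "roberta" -- exactly what the test guards against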
model_class, return_labels=return_labels) if return_labels: - if model_class in MODEL_FOR_PRETRAINING_MAPPING.values(): + if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): inputs_dict["labels"] = torch.zeros( (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device ) diff --git a/tests/test_modeling_auto.py b/tests/test_modeling_auto.py index d395d9640d758c..0ba839c42ade80 100644 --- a/tests/test_modeling_auto.py +++ b/tests/test_modeling_auto.py @@ -13,7 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. - +import copy +import tempfile import unittest from transformers import is_torch_available @@ -46,6 +47,8 @@ BertForSequenceClassification, BertForTokenClassification, BertModel, + FunnelBaseModel, + FunnelModel, GPT2Config, GPT2LMHeadModel, RobertaForMaskedLM, @@ -218,6 +221,21 @@ def test_from_identifier_from_model_type(self): self.assertEqual(model.num_parameters(), 14410) self.assertEqual(model.num_parameters(only_trainable=True), 14410) + def test_from_pretrained_with_tuple_values(self): + # For the auto model mapping, FunnelConfig has two models: FunnelModel and FunnelBaseModel + model = AutoModel.from_pretrained("sgugger/funnel-random-tiny") + self.assertIsInstance(model, FunnelModel) + + config = copy.deepcopy(model.config) + config.architectures = ["FunnelBaseModel"] + model = AutoModel.from_config(config) + self.assertIsInstance(model, FunnelBaseModel) + + with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir) + model = AutoModel.from_pretrained(tmp_dir) + self.assertIsInstance(model, FunnelBaseModel) + def test_parents_and_children_in_mappings(self): # Test that the children are placed before the parents in the mappings, as the `instanceof` will be triggered # by the parents and will return the wrong configuration type when using auto models @@ -242,6 +260,12 @@ def test_parents_and_children_in_mappings(self): assert not issubclass( child_config, parent_config ), f"{child_config.__name__} is child of {parent_config.__name__}" - assert not issubclass( - child_model, parent_model - ), f"{child_config.__name__} is child of {parent_config.__name__}" + + # Tuplify child_model and parent_model since some of them could be tuples. 
+ if not isinstance(child_model, (list, tuple)): + child_model = (child_model,) + if not isinstance(parent_model, (list, tuple)): + parent_model = (parent_model,) + + for child, parent in [(a, b) for a in child_model for b in parent_model]: + assert not issubclass(child, parent), f"{child.__name__} is child of {parent.__name__}" diff --git a/tests/test_modeling_bert.py b/tests/test_modeling_bert.py index 03f76c264babe9..97da4350ab7c2c 100755 --- a/tests/test_modeling_bert.py +++ b/tests/test_modeling_bert.py @@ -17,6 +17,7 @@ import unittest from transformers import is_torch_available +from transformers.models.auto import get_values from transformers.testing_utils import require_torch, slow, torch_device from .test_configuration_common import ConfigTester @@ -444,7 +445,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) if return_labels: - if model_class in MODEL_FOR_PRETRAINING_MAPPING.values(): + if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): inputs_dict["labels"] = torch.zeros( (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device ) diff --git a/tests/test_modeling_big_bird.py b/tests/test_modeling_big_bird.py index 340708218327c4..edef01f207a511 100644 --- a/tests/test_modeling_big_bird.py +++ b/tests/test_modeling_big_bird.py @@ -19,6 +19,7 @@ from tests.test_modeling_common import floats_tensor from transformers import is_torch_available +from transformers.models.auto import get_values from transformers.models.big_bird.tokenization_big_bird import BigBirdTokenizer from transformers.testing_utils import require_torch, slow, torch_device @@ -458,7 +459,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) if return_labels: - if model_class in MODEL_FOR_PRETRAINING_MAPPING.values(): + if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): inputs_dict["labels"] = torch.zeros( (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device ) @@ -788,7 +789,7 @@ def test_tokenizer_inference(self): model.to(torch_device) text = [ - 'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will add words that should not exsist and be tokenized to , such as saoneuhaoesuth ... This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will add words that should not exsist and be tokenized to , such as saoneuhaoesuth ,, I was born in 92000, and this is falsé.' + "Transformer-based models are unable to process long sequences due to their self-attention operation, which scales quadratically with the sequence length. To address this limitation, we introduce the Longformer with an attention mechanism that scales linearly with sequence length, making it easy to process documents of thousands of tokens or longer. Longformer’s attention mechanism is a drop-in replacement for the standard self-attention and combines a local windowed attention with a task motivated global attention. Following prior work on long-sequence transformers, we evaluate Longformer on character-level language modeling and achieve state-of-the-art results on text8 and enwik8. In contrast to most prior work, we also pretrain Longformer and finetune it on a variety of downstream tasks. 
Our pretrained Longformer consistently outperforms RoBERTa on long document tasks and sets new state-of-the-art results on WikiHop and TriviaQA." ] inputs = tokenizer(text) @@ -798,22 +799,22 @@ def test_tokenizer_inference(self): prediction = model(**inputs) prediction = prediction[0] - self.assertEqual(prediction.shape, torch.Size((1, 128, 768))) + self.assertEqual(prediction.shape, torch.Size((1, 199, 768))) expected_prediction = torch.tensor( [ - [-0.0745, 0.0689, -0.1126, -0.0610], - [-0.0343, 0.0111, -0.0269, -0.0858], - [0.1150, 0.0896, 0.0492, 0.0149], - [-0.0657, 0.2035, 0.0444, -0.0535], - [0.1143, 0.0465, 0.1583, -0.1855], - [-0.0216, 0.0807, 0.0536, 0.1371], - [-0.1879, 0.0097, -0.1916, 0.1701], - [0.7616, 0.1240, 0.0669, 0.2588], - [0.1096, -0.1810, -0.1987, 0.0445], - [0.1810, -0.3608, -0.0081, 0.1764], - [-0.0472, 0.0460, 0.0976, -0.0021], - [-0.0274, -0.3274, -0.0788, 0.0465], + [-0.0213, -0.2213, -0.0061, 0.0687], + [0.0977, 0.1858, 0.2374, 0.0483], + [0.2112, -0.2524, 0.5793, 0.0967], + [0.2473, -0.5070, -0.0630, 0.2174], + [0.2885, 0.1139, 0.6071, 0.2991], + [0.2328, -0.2373, 0.3648, 0.1058], + [0.2517, -0.0689, 0.0555, 0.0880], + [0.1021, -0.1495, -0.0635, 0.1891], + [0.0591, -0.0722, 0.2243, 0.2432], + [-0.2059, -0.2679, 0.3225, 0.6183], + [0.2280, -0.2618, 0.1693, 0.0103], + [0.0183, -0.1375, 0.2284, -0.1707], ], device=torch_device, ) @@ -826,11 +827,11 @@ def test_inference_question_answering(self): ) model.to(torch_device) - context = "🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between TensorFlow 2.0 and PyTorch. Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a question answering dataset is the SQuAD dataset" + context = "The BigBird model was proposed in Big Bird: Transformers for Longer Sequences by Zaheer, Manzil and Guruganesh, Guru and Dubey, Kumar Avinava and Ainslie, Joshua and Alberti, Chris and Ontanon, Santiago and Pham, Philip and Ravula, Anirudh and Wang, Qifan and Yang, Li and others. BigBird, is a sparse-attention based transformer which extends Transformer based models, such as BERT to much longer sequences. In addition to sparse attention, BigBird also applies global attention as well as random attention to the input sequence. Theoretically, it has been shown that applying sparse, global, and random attention approximates full attention, while being computationally much more efficient for longer sequences. As a consequence of the capability to handle longer context, BigBird has shown improved performance on various long document NLP tasks, such as question answering and summarization, compared to BERT or RoBERTa." 
question = [ - "How many pretrained models are available in 🤗 Transformers?", - "🤗 Transformers provides interoperability between which frameworks?", + "Which is better for longer sequences- BigBird or BERT?", + "What is the benefit of using BigBird over BERT?", ] inputs = tokenizer( question, @@ -838,7 +839,7 @@ def test_inference_question_answering(self): padding=True, return_tensors="pt", add_special_tokens=True, - max_length=128, + max_length=256, truncation=True, ) @@ -848,11 +849,11 @@ def test_inference_question_answering(self): # fmt: off target_start_logits = torch.tensor( - [[-9.5889, -10.2121, -14.2158, -11.1457, -10.7376, -7.3907, -10.2084, -9.5659, -15.0336, -8.6686, -9.1737, -11.1457, -13.4722, -6.3336, -9.6311, -8.4821, -15.141, -9.1226, -10.3328, -11.1457, -6.6793, -3.9627, 2.7126, -5.5607, -8.4625, -12.499, -11.4757, -9.6334, -4.0565, -10.0474, -7.4126, -13.5669], [-15.3796, -12.6863, -10.3951, -7.6706, -10.1808, -11.4401, -15.5868, -12.7959, -11.0186, -12.6863, -14.2198, -8.1182, -11.1353, -11.6512, -15.702, -12.8964, -12.5173, -12.6863, -14.4133, -13.1532, -12.2846, -14.1572, -11.2747, -11.1159, -11.5219, -13.1115, -11.8779, -13.989, -11.5234, -15.0459, -10.0178, -12.9253]], # noqa: E231 + [[-8.9304, -10.3849, -14.4997, -9.6497, -13.9469, -7.8134, -8.9687, -13.3585, -9.7987, -13.8869, -9.2632, -8.9294, -13.6721, -7.3198, -9.5434, -11.2641, -14.3245, -9.5705, -12.7367, -8.6168, -11.083, -13.7573, -8.1151, -14.5329, -7.6876, -15.706, -12.8558, -9.1135, 8.0909, -3.1925, -11.5812, -9.4822], [-11.5595, -14.5591, -10.2978, -14.8445, -10.2092, -11.1899, -13.8356, -10.5644, -14.7706, -9.9841, -11.0052, -14.1862, -8.8173, -11.1098, -12.4686, -15.0531, -11.0196, -13.6614, -10.0236, -11.8151, -14.8744, -9.5123, -15.1605, -8.6472, -15.4184, -8.898, -9.6328, -7.0258, -11.3365, -14.4065, -10.2587, -8.9103]], # noqa: E231 device=torch_device, ) target_end_logits = torch.tensor( - [[-12.4895, -10.9826, -13.8226, -11.9922, -13.2647, -12.4584, -10.6143, -9.4091, -16.844, -14.0393, -9.5914, -11.9922, -15.5142, -11.4073, -10.1064, -8.3961, -16.4374, -13.9323, -10.791, -11.9922, -8.736, -9.5672, 0.2844, -4.0976, -13.849, -11.8035, -12.7784, -14.1314, -7.4138, -10.5488, -8.0133, -14.8779], [-14.9831, -13.4818, -13.1566, -12.7259, -10.5892, -10.8605, -17.2376, -15.9398, -12.8739, -13.4818, -16.6979, -13.3403, -11.6416, -11.392, -16.9553, -15.723, -13.2643, -13.4818, -16.2067, -15.6688, -15.0449, -15.1253, -15.1373, -12.385, -13.3652, -15.9473, -14.9587, -15.5024, -13.1482, -16.6358, -12.3908, -15.7493]], # noqa: E231 + [[-12.4131, -8.5959, -15.7163, -11.1524, -15.9913, -12.2038, -7.8902, -16.0296, -12.164, -16.5017, -13.3332, -6.9488, -15.7756, -13.8506, -11.0779, -9.2893, -15.0426, -10.1963, -17.3292, -12.2945, -11.5337, -16.4514, -9.1564, -17.5001, -9.1562, -16.2971, -13.3199, -7.5724, -5.1175, 7.2168, -10.3804, -11.9873], [-10.8654, -14.9967, -11.4144, -16.9189, -14.2673, -9.7068, -15.0182, -12.8846, -16.8716, -13.665, -10.3113, -15.1436, -14.9069, -13.3364, -11.2339, -16.0118, -11.8331, -17.0613, -13.8852, -12.4163, -16.8978, -10.7772, -17.2324, -10.6979, -16.9811, -10.3427, -9.497, -13.7104, -11.1107, -13.2936, -13.855, -14.1264]], # noqa: E231 device=torch_device, ) # fmt: on @@ -867,7 +868,7 @@ def test_inference_question_answering(self): ] answer = tokenizer.batch_decode(answer) - self.assertTrue(answer == ["32", "[SEP]"]) + self.assertTrue(answer == ["BigBird", "global attention"]) def test_fill_mask(self): tokenizer = BigBirdTokenizer.from_pretrained("google/bigbird-roberta-base") 
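The hunks that follow replace every `MODEL_..._MAPPING.values()` membership check with `get_values(MODEL_..._MAPPING)`, because an auto-model mapping entry may now hold a tuple of model classes (FunnelConfig, for instance, maps to both FunnelModel and FunnelBaseModel, as exercised in test_modeling_auto.py above). As a minimal sketch of what such a helper has to do (the real implementation lives in transformers.models.auto and may differ in detail), flattening the mapping values is enough:

def get_values(model_mapping):
    """Sketch only: flatten an auto-model mapping's values into one list of classes.

    Tuple- or list-valued entries (one config mapped to several model classes)
    are expanded so that plain membership checks keep working.
    """
    result = []
    for value in model_mapping.values():
        if isinstance(value, (list, tuple)):
            result.extend(value)
        else:
            result.append(value)
    return result

With a helper along these lines, a check such as `model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING)` behaves the same whether a config maps to one model class or to several.
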
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 9ce171e6493887..d5d76162bc0fd0 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -24,6 +24,7 @@ from transformers import is_torch_available from transformers.file_utils import WEIGHTS_NAME +from transformers.models.auto import get_values from transformers.testing_utils import require_torch, require_torch_multi_gpu, slow, torch_device @@ -79,7 +80,7 @@ class ModelTesterMixin: def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = copy.deepcopy(inputs_dict) - if model_class in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values(): + if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING): inputs_dict = { k: v.unsqueeze(1).expand(-1, self.model_tester.num_choices, -1).contiguous() if isinstance(v, torch.Tensor) and v.ndim > 1 @@ -88,9 +89,9 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): } if return_labels: - if model_class in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values(): + if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING): inputs_dict["labels"] = torch.ones(self.model_tester.batch_size, dtype=torch.long, device=torch_device) - elif model_class in MODEL_FOR_QUESTION_ANSWERING_MAPPING.values(): + elif model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING): inputs_dict["start_positions"] = torch.zeros( self.model_tester.batch_size, dtype=torch.long, device=torch_device ) @@ -98,18 +99,18 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): self.model_tester.batch_size, dtype=torch.long, device=torch_device ) elif model_class in [ - *MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.values(), - *MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING.values(), - *MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING.values(), + *get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING), + *get_values(MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING), + *get_values(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING), ]: inputs_dict["labels"] = torch.zeros( self.model_tester.batch_size, dtype=torch.long, device=torch_device ) elif model_class in [ - *MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.values(), - *MODEL_FOR_CAUSAL_LM_MAPPING.values(), - *MODEL_FOR_MASKED_LM_MAPPING.values(), - *MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.values(), + *get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING), + *get_values(MODEL_FOR_CAUSAL_LM_MAPPING), + *get_values(MODEL_FOR_MASKED_LM_MAPPING), + *get_values(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING), ]: inputs_dict["labels"] = torch.zeros( (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device @@ -229,7 +230,7 @@ def test_training(self): config.return_dict = True for model_class in self.all_model_classes: - if model_class in MODEL_MAPPING.values(): + if model_class in get_values(MODEL_MAPPING): continue model = model_class(config) model.to(torch_device) @@ -248,7 +249,7 @@ def test_training_gradient_checkpointing(self): config.return_dict = True for model_class in self.all_model_classes: - if model_class in MODEL_MAPPING.values(): + if model_class in get_values(MODEL_MAPPING): continue model = model_class(config) model.to(torch_device) @@ -312,7 +313,7 @@ def test_attention_outputs(self): if "labels" in inputs_dict: correct_outlen += 1 # loss is added to beginning # Question Answering model returns start_logits and end_logits - if model_class in MODEL_FOR_QUESTION_ANSWERING_MAPPING.values(): + if model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING): 
correct_outlen += 1 # start_logits and end_logits instead of only 1 output if "past_key_values" in outputs: correct_outlen += 1 # past_key_values have been returned diff --git a/tests/test_modeling_convbert.py b/tests/test_modeling_convbert.py index 610affc45157eb..062a7f506a996f 100644 --- a/tests/test_modeling_convbert.py +++ b/tests/test_modeling_convbert.py @@ -19,6 +19,7 @@ from tests.test_modeling_common import floats_tensor from transformers import is_torch_available +from transformers.models.auto import get_values from transformers.testing_utils import require_torch, slow, torch_device from .test_configuration_common import ConfigTester @@ -352,7 +353,7 @@ def test_attention_outputs(self): if "labels" in inputs_dict: correct_outlen += 1 # loss is added to beginning # Question Answering model returns start_logits and end_logits - if model_class in MODEL_FOR_QUESTION_ANSWERING_MAPPING.values(): + if model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING): correct_outlen += 1 # start_logits and end_logits instead of only 1 output if "past_key_values" in outputs: correct_outlen += 1 # past_key_values have been returned diff --git a/tests/test_modeling_electra.py b/tests/test_modeling_electra.py index 88138a587ccd1a..5935eafee668c0 100644 --- a/tests/test_modeling_electra.py +++ b/tests/test_modeling_electra.py @@ -17,6 +17,7 @@ import unittest from transformers import is_torch_available +from transformers.models.auto import get_values from transformers.testing_utils import require_torch, slow, torch_device from .test_configuration_common import ConfigTester @@ -292,7 +293,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) if return_labels: - if model_class in MODEL_FOR_PRETRAINING_MAPPING.values(): + if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): inputs_dict["labels"] = torch.zeros( (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device ) diff --git a/tests/test_modeling_flax_bert.py b/tests/test_modeling_flax_bert.py index fc339f7501b7cf..273f55d157d241 100644 --- a/tests/test_modeling_flax_bert.py +++ b/tests/test_modeling_flax_bert.py @@ -29,6 +29,7 @@ FlaxBertForNextSentencePrediction, FlaxBertForPreTraining, FlaxBertForQuestionAnswering, + FlaxBertForSequenceClassification, FlaxBertForTokenClassification, FlaxBertModel, ) @@ -125,6 +126,7 @@ class FlaxBertModelTest(FlaxModelTesterMixin, unittest.TestCase): FlaxBertForMultipleChoice, FlaxBertForQuestionAnswering, FlaxBertForNextSentencePrediction, + FlaxBertForSequenceClassification, FlaxBertForTokenClassification, FlaxBertForQuestionAnswering, ) diff --git a/tests/test_modeling_funnel.py b/tests/test_modeling_funnel.py index 0e3846cef147c1..4435359eb68fb0 100644 --- a/tests/test_modeling_funnel.py +++ b/tests/test_modeling_funnel.py @@ -17,6 +17,7 @@ import unittest from transformers import FunnelTokenizer, is_torch_available +from transformers.models.auto import get_values from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device from .test_configuration_common import ConfigTester @@ -365,7 +366,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) if return_labels: - if model_class in MODEL_FOR_PRETRAINING_MAPPING.values(): + if model_class in 
get_values(MODEL_FOR_PRETRAINING_MAPPING): inputs_dict["labels"] = torch.zeros( (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device ) diff --git a/tests/test_modeling_gpt_neo.py b/tests/test_modeling_gpt_neo.py index 023a9d265edfdb..14d966d61b4bce 100644 --- a/tests/test_modeling_gpt_neo.py +++ b/tests/test_modeling_gpt_neo.py @@ -18,6 +18,7 @@ import unittest from transformers import is_torch_available +from transformers.file_utils import cached_property from transformers.testing_utils import require_torch, slow, torch_device from .test_configuration_common import ConfigTester @@ -35,6 +36,7 @@ GPTNeoForCausalLM, GPTNeoModel, ) + from transformers.models.gpt_neo.modeling_gpt_neo import GPTNeoAttentionMixin, GPTNeoLocalSelfAttention class GPTNeoModelTester: @@ -430,11 +432,164 @@ def _check_attentions_for_generate( # check attn size self.assertListEqual(shapes, expected_shape) + +@require_torch +class GPTNeoLocalAttentionTest(unittest.TestCase): + def _get_hidden_states(self): + return torch.tensor( + [ + [ + [0.4983, -0.7584, -1.6944, 0.5440], + [2.6918, 0.4206, 0.4176, 0.2055], + [-0.0071, -0.0405, -1.4920, -0.3630], + [1.0492, 0.1599, -1.7648, 0.2419], + [-1.8348, 2.0514, -0.1946, 0.3203], + [0.7672, -1.1600, -1.7118, -0.9056], + [0.2986, 0.5372, 0.7729, -0.1927], + [0.0285, 0.2629, -1.1156, -1.1992], + ] + ], + dtype=torch.float32, + device=torch_device, + ) + + def test_look_back(self): + hidden_states = self._get_hidden_states() + batch_size, seq_length, hidden_size = hidden_states.shape + + # check when seq_length is divisible by window_size + window_size = 4 + block_length, num_block = GPTNeoAttentionMixin._get_block_length_and_num_blocks(seq_length, window_size) + blocked_hidden_states = GPTNeoAttentionMixin._look_back(hidden_states, block_length, window_size) + expected_shape = [batch_size, num_block, window_size + block_length, hidden_size] + self.assertListEqual(list(blocked_hidden_states.shape), expected_shape) + # The last block should contain the last (window_size + block_length) hidden_states + self.assertTrue( + torch.all(blocked_hidden_states[:, -1, ...] == hidden_states[:, -(window_size + block_length) :, ...]) + ) + + # check when seq_length is not divisible by window_size + window_size = 3 + block_length, num_block = GPTNeoAttentionMixin._get_block_length_and_num_blocks(seq_length, window_size) + blocked_hidden_states = GPTNeoAttentionMixin._look_back(hidden_states, block_length, window_size) + expected_shape = [batch_size, num_block, window_size + block_length, hidden_size] + self.assertListEqual(list(blocked_hidden_states.shape), expected_shape) + # The last block should contain the last (window_size + block_length) hidden_states + self.assertTrue( + torch.all(blocked_hidden_states[:, -1, ...] 
== hidden_states[:, -(window_size + block_length) :, ...]) + ) + + # check when window_size is > seq_length + window_size = 19 + block_length, num_block = GPTNeoAttentionMixin._get_block_length_and_num_blocks(seq_length, window_size) + blocked_hidden_states = GPTNeoAttentionMixin._look_back(hidden_states, block_length, window_size) + expected_shape = [batch_size, num_block, window_size + block_length, hidden_size] + self.assertListEqual(list(blocked_hidden_states.shape), expected_shape) + + # when window_size > seq_length, num_blocks becomes 1, in this case + # the first window_size values in blocked_hidden_staes are all zeros + # and the last block_length values are equal to the hidden_states + values = blocked_hidden_states[:, -1, :window_size, ...] + expected_values = torch.zeros_like(values) + self.assertTrue(torch.all(values == expected_values)) + + self.assertTrue(torch.all(blocked_hidden_states[:, -1, -block_length:, ...] == hidden_states)) + + def test_create_attention_mask(self): + config = GPTNeoConfig.from_pretrained("valhalla/gpt-neo-random-tiny") + layer = GPTNeoLocalSelfAttention(config) + window_size = config.window_size + batch_size, seq_length = 8, 1 + block_length, num_blocks = GPTNeoAttentionMixin._get_block_length_and_num_blocks(seq_length, window_size) + + causal_mask = layer._create_attention_mask(batch_size, seq_length, num_blocks, block_length, torch_device) + # check shapes + expected_shape = [batch_size, num_blocks, 1, block_length, window_size + block_length] + self.assertListEqual(list(causal_mask.shape), expected_shape) + # first window_size tokens in the first block are always padded + # and should not be attended + self.assertTrue(torch.all(causal_mask[:, 0, :, :, :window_size] == 0)) + # each window can attend at most window_size tokens + self.assertTrue(torch.all(torch.sum(causal_mask, dim=4) <= config.window_size)) + + # check if user provided attention_mask is handled correctly + attention_mask = torch.ones(batch_size, seq_length, dtype=torch.long, device=torch_device) + attention_mask[:, -3:] = 0 # don't attend last 3 tokens + + causal_mask = layer._create_attention_mask( + batch_size, seq_length, num_blocks, block_length, torch_device, attention_mask + ) + # last 3 tokens will be in the last block and shoul have 0s in causal_mask + self.assertTrue(torch.all(causal_mask[:, -1, :, :, -3:] == 0)) + # check shapes + expected_shape = [batch_size, num_blocks, 1, block_length, window_size + block_length] + self.assertListEqual(list(causal_mask.shape), expected_shape) + # first window_size tokens in the first block are always padded + # and should not be attended + self.assertTrue(torch.all(causal_mask[:, 0, :, :, :window_size] == 0)) + # each window can attend at most window_size tokens + self.assertTrue(torch.all(torch.sum(causal_mask, dim=4) <= config.window_size)) + + def test_local_attn_probs(self): + model = GPTNeoModel.from_pretrained("valhalla/gpt-neo-random-tiny").eval() + layer = model.h[1].attn.attention.to(torch_device) + hidden_states = self._get_hidden_states() + hidden_states = torch.cat([hidden_states, hidden_states - 0.5], dim=2) + batch_size, seq_length, hidden_size = hidden_states.shape + mask_tokens = 3 + attention_mask = torch.ones(batch_size, seq_length, device=torch_device, dtype=torch.long) + attention_mask[:, -mask_tokens:] = 0 # dont atten last mask_tokens + + _, attn_probs = layer(hidden_states, attention_mask=attention_mask, output_attentions=True) + + # the last 3 tokens will be in the last block, and should have 0 attn_probs + 
self.assertTrue(torch.all(attn_probs[:, -1, :, -mask_tokens:, -mask_tokens:] == 0)) + # the first config.window_size tokens in the first block are always padded + # and should have 0 attn_probs + self.assertTrue(torch.all(attn_probs[:, 0, :, : model.config.window_size :, : model.config.window_size] == 0)) + + +@require_torch +class GPTNeoModelLanguageGenerationTest(unittest.TestCase): + @cached_property + def model(self): + return GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B").to(torch_device) + + @cached_property + def tokenizer(self): + return GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B") + + @slow + def test_lm_generate_gpt_neo(self): + for checkpointing in [True, False]: + model = self.model + model.config.gradient_checkpointing = checkpointing + input_ids = torch.tensor([[464, 3290]], dtype=torch.long, device=torch_device) # The dog + # fmt: off + # The dog-eared copy of the book, which is a collection of essays by the late author, + expected_output_ids = [464, 3290, 12, 3380, 4866, 286, 262, 1492, 11, 543, 318, 257, 4947, 286, 27126, 416, 262, 2739, 1772, 11] + # fmt: on + output_ids = model.generate(input_ids, do_sample=False) + self.assertListEqual(output_ids[0].tolist(), expected_output_ids) + + @slow + def test_gpt_neo_sample(self): + model = self.model + tokenizer = self.tokenizer + + torch.manual_seed(0) + tokenized = tokenizer("Today is a nice day and", return_tensors="pt", return_token_type_ids=True) + input_ids = tokenized.input_ids.to(torch_device) + output_ids = model.generate(input_ids, do_sample=True) + output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True) + + EXPECTED_OUTPUT_STR = "Today is a nice day and if you don’t get the memo here is what you can" + self.assertEqual(output_str, EXPECTED_OUTPUT_STR) + @slow def test_batch_generation(self): - model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B") - model.to(torch_device) - tokenizer = GPT2Tokenizer.from_pretrained("gpt2") + model = self.model + tokenizer = self.tokenizer tokenizer.padding_side = "left" @@ -479,33 +634,3 @@ def test_model_from_pretrained(self): for model_name in GPT_NEO_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: model = GPTNeoModel.from_pretrained(model_name) self.assertIsNotNone(model) - - -@require_torch -class GPTNeoModelLanguageGenerationTest(unittest.TestCase): - @slow - def test_lm_generate_gpt_neo(self): - for checkpointing in [True, False]: - model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B", gradient_checkpointing=checkpointing) - model.to(torch_device) - input_ids = torch.tensor([[464, 3290]], dtype=torch.long, device=torch_device) # The dog - # fmt: off - expected_output_ids = [464, 3290, 12, 3380, 4866, 286, 262, 1492, 11, 543, 318, 257, 4947, 286, 27126, 416, 262, 2739, 1772, 11] # The dog-eared copy of the book, which is a collection of essays by the late author, - # fmt: on - output_ids = model.generate(input_ids, do_sample=False) - self.assertListEqual(output_ids[0].tolist(), expected_output_ids) - - @slow - def test_gpt_neo_sample(self): - tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B") - model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B") - model.to(torch_device) - - torch.manual_seed(0) - tokenized = tokenizer("Today is a nice day and", return_tensors="pt", return_token_type_ids=True) - input_ids = tokenized.input_ids.to(torch_device) - output_ids = model.generate(input_ids, do_sample=True) - output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True) - - 
EXPECTED_OUTPUT_STR = "Today is a nice day and if you don’t get the memo here is what you can" - self.assertEqual(output_str, EXPECTED_OUTPUT_STR) diff --git a/tests/test_modeling_led.py b/tests/test_modeling_led.py index 416606014575c7..caffe199bb2b14 100644 --- a/tests/test_modeling_led.py +++ b/tests/test_modeling_led.py @@ -21,6 +21,7 @@ from transformers import is_torch_available from transformers.file_utils import cached_property +from transformers.models.auto import get_values from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device from .test_configuration_common import ConfigTester @@ -412,7 +413,7 @@ def test_attention_outputs(self): if "labels" in inputs_dict: correct_outlen += 1 # loss is added to beginning # Question Answering model returns start_logits and end_logits - if model_class in MODEL_FOR_QUESTION_ANSWERING_MAPPING.values(): + if model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING): correct_outlen += 1 # start_logits and end_logits instead of only 1 output if "past_key_values" in outputs: correct_outlen += 1 # past_key_values have been returned diff --git a/tests/test_modeling_lxmert.py b/tests/test_modeling_lxmert.py index f05b3c3ee85e6b..b03cc31335d903 100644 --- a/tests/test_modeling_lxmert.py +++ b/tests/test_modeling_lxmert.py @@ -18,6 +18,7 @@ import unittest from transformers import is_torch_available +from transformers.models.auto import get_values from transformers.testing_utils import require_torch, slow, torch_device from .test_configuration_common import ConfigTester @@ -532,11 +533,11 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = copy.deepcopy(inputs_dict) if return_labels: - if model_class in MODEL_FOR_QUESTION_ANSWERING_MAPPING.values(): + if model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING): inputs_dict["labels"] = torch.zeros( self.model_tester.batch_size, dtype=torch.long, device=torch_device ) - elif model_class in MODEL_FOR_PRETRAINING_MAPPING.values(): + elif model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): # special case for models like BERT that use multi-loss training for PreTraining inputs_dict["labels"] = torch.zeros( (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device diff --git a/tests/test_modeling_megatron_bert.py b/tests/test_modeling_megatron_bert.py new file mode 100644 index 00000000000000..5be4716d335be3 --- /dev/null +++ b/tests/test_modeling_megatron_bert.py @@ -0,0 +1,378 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# Copyright 2021 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch MegatronBERT model. 
""" + + +import math +import os +import unittest + +from transformers import is_torch_available +from transformers.models.auto import get_values +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + from transformers import ( + MODEL_FOR_PRETRAINING_MAPPING, + MegatronBertConfig, + MegatronBertForCausalLM, + MegatronBertForMaskedLM, + MegatronBertForMultipleChoice, + MegatronBertForNextSentencePrediction, + MegatronBertForPreTraining, + MegatronBertForQuestionAnswering, + MegatronBertForSequenceClassification, + MegatronBertForTokenClassification, + MegatronBertModel, + ) + + +class MegatronBertModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=64, + embedding_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.embedding_size = embedding_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = MegatronBertConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + embedding_size=self.embedding_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + 
initializer_range=self.initializer_range, + ) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def create_and_check_megatron_bert_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = MegatronBertModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_megatron_bert_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = MegatronBertForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_causal_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = MegatronBertForCausalLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_megatron_bert_for_next_sequence_prediction( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = MegatronBertForNextSentencePrediction(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + labels=sequence_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, 2)) + + def create_and_check_megatron_bert_for_pretraining( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = MegatronBertForPreTraining(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + labels=token_labels, + next_sentence_label=sequence_labels, + ) + self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + self.parent.assertEqual(result.seq_relationship_logits.shape, (self.batch_size, 2)) + + def create_and_check_megatron_bert_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = MegatronBertForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_megatron_bert_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + 
model = MegatronBertForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_megatron_bert_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = MegatronBertForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_megatron_bert_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = MegatronBertForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class MegatronBertModelTest(ModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + MegatronBertModel, + MegatronBertForMaskedLM, + MegatronBertForCausalLM, + MegatronBertForMultipleChoice, + MegatronBertForNextSentencePrediction, + MegatronBertForPreTraining, + MegatronBertForQuestionAnswering, + MegatronBertForSequenceClassification, + MegatronBertForTokenClassification, + ) + if is_torch_available() + else () + ) + + # test_resize_embeddings = False + test_head_masking = False + + # special case for ForPreTraining model + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): + inputs_dict["labels"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device + ) + inputs_dict["next_sentence_label"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + return inputs_dict + + def setUp(self): + self.model_tester = MegatronBertModelTester(self) + self.config_tester = ConfigTester(self, config_class=MegatronBertConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_megatron_bert_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + 
self.model_tester.create_and_check_megatron_bert_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_megatron_bert_for_masked_lm(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_megatron_bert_for_multiple_choice(*config_and_inputs) + + def test_for_next_sequence_prediction(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_megatron_bert_for_next_sequence_prediction(*config_and_inputs) + + def test_for_pretraining(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_megatron_bert_for_pretraining(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_megatron_bert_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_megatron_bert_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_megatron_bert_for_token_classification(*config_and_inputs) + + +def _long_tensor(tok_lst): + return torch.tensor( + tok_lst, + dtype=torch.long, + device=torch_device, + ) + + +TOLERANCE = 1e-4 + + +@require_torch +@require_sentencepiece +@require_tokenizers +class MegatronBertModelIntegrationTests(unittest.TestCase): + @slow + def test_inference_no_head(self): + directory = "nvidia/megatron-bert-uncased-345m" + if "MYDIR" in os.environ: + directory = os.path.join(os.environ["MYDIR"], directory) + model = MegatronBertModel.from_pretrained(directory) + model.to(torch_device) + model.half() + input_ids = _long_tensor([[101, 7110, 1005, 1056, 2023, 11333, 17413, 1029, 102]]) + with torch.no_grad(): + output = model(input_ids)[0] + expected_shape = torch.Size((1, 9, 1024)) + self.assertEqual(output.shape, expected_shape) + + expected = [-0.6040, -0.2517, -0.1025, 0.3420, -0.6758, -0.0017, -0.1089, -0.1990, 0.5728] + for ii in range(3): + for jj in range(3): + a = output[0, ii, jj] + b = expected[3 * ii + jj] + msg = "ii={} jj={} a={} b={}".format(ii, jj, a, b) + self.assertTrue(math.isclose(a, b, rel_tol=TOLERANCE, abs_tol=TOLERANCE), msg=msg) diff --git a/tests/test_modeling_mobilebert.py b/tests/test_modeling_mobilebert.py index 9a0fc9ae96e44d..96c974e2edc534 100644 --- a/tests/test_modeling_mobilebert.py +++ b/tests/test_modeling_mobilebert.py @@ -17,6 +17,7 @@ import unittest from transformers import is_torch_available +from transformers.models.auto import get_values from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device from .test_configuration_common import ConfigTester @@ -272,7 +273,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) if return_labels: - if model_class in MODEL_FOR_PRETRAINING_MAPPING.values(): + if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): inputs_dict["labels"] = torch.zeros( (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, 
device=torch_device ) diff --git a/tests/test_modeling_tapas.py b/tests/test_modeling_tapas.py index b4f8f1323184e3..b36147d5586f36 100644 --- a/tests/test_modeling_tapas.py +++ b/tests/test_modeling_tapas.py @@ -32,6 +32,7 @@ is_torch_available, ) from transformers.file_utils import cached_property +from transformers.models.auto import get_values from transformers.testing_utils import require_scatter, require_torch, slow, torch_device from .test_configuration_common import ConfigTester @@ -425,7 +426,7 @@ class TapasModelTest(ModelTesterMixin, unittest.TestCase): def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = copy.deepcopy(inputs_dict) - if model_class in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values(): + if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING): inputs_dict = { k: v.unsqueeze(1).expand(-1, self.model_tester.num_choices, -1).contiguous() if isinstance(v, torch.Tensor) and v.ndim > 1 @@ -434,9 +435,9 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): } if return_labels: - if model_class in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values(): + if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING): inputs_dict["labels"] = torch.ones(self.model_tester.batch_size, dtype=torch.long, device=torch_device) - elif model_class in MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING.values(): + elif model_class in get_values(MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING): inputs_dict["labels"] = torch.zeros( (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device ) @@ -457,17 +458,17 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): self.model_tester.batch_size, dtype=torch.float, device=torch_device ) elif model_class in [ - *MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.values(), - *MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING.values(), + *get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING), + *get_values(MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING), ]: inputs_dict["labels"] = torch.zeros( self.model_tester.batch_size, dtype=torch.long, device=torch_device ) elif model_class in [ - *MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.values(), - *MODEL_FOR_CAUSAL_LM_MAPPING.values(), - *MODEL_FOR_MASKED_LM_MAPPING.values(), - *MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.values(), + *get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING), + *get_values(MODEL_FOR_CAUSAL_LM_MAPPING), + *get_values(MODEL_FOR_MASKED_LM_MAPPING), + *get_values(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING), ]: inputs_dict["labels"] = torch.zeros( (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device diff --git a/tests/test_modeling_tf_albert.py b/tests/test_modeling_tf_albert.py index aabd185f7837ea..ab6b32ab849599 100644 --- a/tests/test_modeling_tf_albert.py +++ b/tests/test_modeling_tf_albert.py @@ -17,6 +17,7 @@ import unittest from transformers import AlbertConfig, is_tf_available +from transformers.models.auto import get_values from transformers.testing_utils import require_tf, slow from .test_configuration_common import ConfigTester @@ -249,7 +250,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) if return_labels: - if model_class in TF_MODEL_FOR_PRETRAINING_MAPPING.values(): + if model_class in get_values(TF_MODEL_FOR_PRETRAINING_MAPPING): inputs_dict["sentence_order_label"] = tf.zeros(self.model_tester.batch_size, 
dtype=tf.int32) return inputs_dict diff --git a/tests/test_modeling_tf_auto.py b/tests/test_modeling_tf_auto.py index ff80adc369c47d..eb0b05f2c7da38 100644 --- a/tests/test_modeling_tf_auto.py +++ b/tests/test_modeling_tf_auto.py @@ -13,7 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. - +import copy +import tempfile import unittest from transformers import is_tf_available @@ -39,6 +40,8 @@ TFBertForQuestionAnswering, TFBertForSequenceClassification, TFBertModel, + TFFunnelBaseModel, + TFFunnelModel, TFGPT2LMHeadModel, TFRobertaForMaskedLM, TFT5ForConditionalGeneration, @@ -176,6 +179,21 @@ def test_from_identifier_from_model_type(self): self.assertEqual(model.num_parameters(), 14410) self.assertEqual(model.num_parameters(only_trainable=True), 14410) + def test_from_pretrained_with_tuple_values(self): + # For the auto model mapping, FunnelConfig has two models: FunnelModel and FunnelBaseModel + model = TFAutoModel.from_pretrained("sgugger/funnel-random-tiny") + self.assertIsInstance(model, TFFunnelModel) + + config = copy.deepcopy(model.config) + config.architectures = ["FunnelBaseModel"] + model = TFAutoModel.from_config(config) + self.assertIsInstance(model, TFFunnelBaseModel) + + with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir) + model = TFAutoModel.from_pretrained(tmp_dir) + self.assertIsInstance(model, TFFunnelBaseModel) + def test_parents_and_children_in_mappings(self): # Test that the children are placed before the parents in the mappings, as the `instanceof` will be triggered # by the parents and will return the wrong configuration type when using auto models @@ -197,4 +215,12 @@ def test_parents_and_children_in_mappings(self): for parent_config, parent_model in mapping[: index + 1]: with self.subTest(msg=f"Testing if {child_config.__name__} is child of {parent_config.__name__}"): self.assertFalse(issubclass(child_config, parent_config)) - self.assertFalse(issubclass(child_model, parent_model)) + + # Tuplify child_model and parent_model since some of them could be tuples. 
+ if not isinstance(child_model, (list, tuple)): + child_model = (child_model,) + if not isinstance(parent_model, (list, tuple)): + parent_model = (parent_model,) + + for child, parent in [(a, b) for a in child_model for b in parent_model]: + assert not issubclass(child, parent), f"{child.__name__} is child of {parent.__name__}" diff --git a/tests/test_modeling_tf_bert.py b/tests/test_modeling_tf_bert.py index 8817ae2bc1ce51..639ba0be9d7397 100644 --- a/tests/test_modeling_tf_bert.py +++ b/tests/test_modeling_tf_bert.py @@ -17,6 +17,7 @@ import unittest from transformers import BertConfig, is_tf_available +from transformers.models.auto import get_values from transformers.testing_utils import require_tf, slow from .test_configuration_common import ConfigTester @@ -282,7 +283,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) if return_labels: - if model_class in TF_MODEL_FOR_PRETRAINING_MAPPING.values(): + if model_class in get_values(TF_MODEL_FOR_PRETRAINING_MAPPING): inputs_dict["next_sentence_label"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32) return inputs_dict diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py index a2f708566060a9..51daf3779dc593 100644 --- a/tests/test_modeling_tf_common.py +++ b/tests/test_modeling_tf_common.py @@ -25,6 +25,7 @@ from typing import List, Tuple from transformers import is_tf_available +from transformers.models.auto import get_values from transformers.testing_utils import ( _tf_gpu_memory_limit, is_pt_tf_cross_test, @@ -89,7 +90,7 @@ class TFModelTesterMixin: def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> dict: inputs_dict = copy.deepcopy(inputs_dict) - if model_class in TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values(): + if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING): inputs_dict = { k: tf.tile(tf.expand_dims(v, 1), (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1)) if isinstance(v, tf.Tensor) and v.ndim > 0 @@ -98,21 +99,21 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> d } if return_labels: - if model_class in TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values(): + if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING): inputs_dict["labels"] = tf.ones(self.model_tester.batch_size, dtype=tf.int32) - elif model_class in TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING.values(): + elif model_class in get_values(TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING): inputs_dict["start_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32) inputs_dict["end_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32) - elif model_class in TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.values(): + elif model_class in get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING): inputs_dict["labels"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32) - elif model_class in TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING.values(): + elif model_class in get_values(TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING): inputs_dict["next_sentence_label"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32) elif model_class in [ - *TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.values(), - *TF_MODEL_FOR_CAUSAL_LM_MAPPING.values(), - *TF_MODEL_FOR_MASKED_LM_MAPPING.values(), - *TF_MODEL_FOR_PRETRAINING_MAPPING.values(), - *TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.values(), + 
*get_values(TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING), + *get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING), + *get_values(TF_MODEL_FOR_MASKED_LM_MAPPING), + *get_values(TF_MODEL_FOR_PRETRAINING_MAPPING), + *get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING), ]: inputs_dict["labels"] = tf.zeros( (self.model_tester.batch_size, self.model_tester.seq_length), dtype=tf.int32 @@ -580,7 +581,7 @@ def test_compile_tf_model(self): ), "input_ids": tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32"), } - elif model_class in TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values(): + elif model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING): input_ids = tf.keras.Input(batch_shape=(4, 2, max_input), name="input_ids", dtype="int32") else: input_ids = tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32") @@ -796,9 +797,9 @@ def check_hidden_states_output(config, inputs_dict, model_class): def test_model_common_attributes(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() list_lm_models = ( - list(TF_MODEL_FOR_CAUSAL_LM_MAPPING.values()) - + list(TF_MODEL_FOR_MASKED_LM_MAPPING.values()) - + list(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.values()) + get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING) + + get_values(TF_MODEL_FOR_MASKED_LM_MAPPING) + + get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING) ) for model_class in self.all_model_classes: @@ -1128,7 +1129,7 @@ def test_loss_computation(self): ] loss_size = tf.size(added_label) - if model.__class__ in TF_MODEL_FOR_CAUSAL_LM_MAPPING.values(): + if model.__class__ in get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING): # if loss is causal lm loss, labels are shift, so that one label per batch # is cut loss_size = loss_size - self.model_tester.batch_size diff --git a/tests/test_processor_speech_to_text.py b/tests/test_processor_speech_to_text.py index cf26e32c1db4bf..76a7a7446152d4 100644 --- a/tests/test_processor_speech_to_text.py +++ b/tests/test_processor_speech_to_text.py @@ -19,7 +19,7 @@ from pathlib import Path from shutil import copyfile -from transformers import Speech2TextFeatureExtractor, Speech2TextProcessor, Speech2TextTokenizer +from transformers import Speech2TextTokenizer, is_speech_available from transformers.file_utils import FEATURE_EXTRACTOR_NAME from transformers.models.speech_to_text.tokenization_speech_to_text import VOCAB_FILES_NAMES, save_json from transformers.testing_utils import require_sentencepiece, require_torch, require_torchaudio @@ -27,6 +27,10 @@ from .test_feature_extraction_speech_to_text import floats_list +if is_speech_available(): + from transformers import Speech2TextFeatureExtractor, Speech2TextProcessor + + SAMPLE_SP = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model") diff --git a/tests/test_tokenization_cpm.py b/tests/test_tokenization_cpm.py new file mode 100644 index 00000000000000..c65e8f07528d0e --- /dev/null +++ b/tests/test_tokenization_cpm.py @@ -0,0 +1,39 @@ +# coding=utf-8 +# Copyright 2018 HuggingFace Inc. team. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +from transformers.models.cpm.tokenization_cpm import CpmTokenizer +from transformers.testing_utils import custom_tokenizers + +from .test_modeling_xlnet import XLNetModelTest + + +@custom_tokenizers +class CpmTokenizationTest(XLNetModelTest): + def test_pre_tokenization(self): + tokenizer = CpmTokenizer.from_pretrained("TsinghuaAI/CPM-Generate") + text = "Hugging Face大法好,谁用谁知道。" + normalized_text = "Hugging Face大法好,谁用谁知道。" + bpe_tokens = "▁Hu gg ing ▁ ▂ ▁F ace ▁大法 ▁好 ▁ , ▁谁 ▁用 ▁谁 ▁知 道 ▁ 。".split() + + tokens = tokenizer.tokenize(text) + self.assertListEqual(tokens, bpe_tokens) + + input_tokens = tokens + [tokenizer.unk_token] + + input_bpe_tokens = [13789, 13283, 1421, 8, 10, 1164, 13608, 16528, 63, 8, 9, 440, 108, 440, 121, 90, 8, 12, 0] + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + + reconstructed_text = tokenizer.decode(input_bpe_tokens) + self.assertEqual(reconstructed_text, normalized_text) diff --git a/tests/test_trainer.py b/tests/test_trainer.py index ed1deaa8c21a1b..914e6f5bf2503b 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -132,6 +132,7 @@ def __init__(self, a=0, b=0, double_output=False, **kwargs): self.a = a self.b = b self.double_output = double_output + self.hidden_size = 1 if is_torch_available(): diff --git a/tests/test_trainer_callback.py b/tests/test_trainer_callback.py index 7f97766d318979..6ce90b85546d0a 100644 --- a/tests/test_trainer_callback.py +++ b/tests/test_trainer_callback.py @@ -234,7 +234,7 @@ def test_event_flow(self): self.assertEqual(events, self.get_expected_events(trainer)) # warning should be emitted for duplicated callbacks - with unittest.mock.patch("transformers.trainer_callback.logger.warn") as warn_mock: + with unittest.mock.patch("transformers.trainer_callback.logger.warning") as warn_mock: trainer = self.get_trainer( callbacks=[MyTestTrainerCallback, MyTestTrainerCallback], ) diff --git a/tests/test_versions_utils.py b/tests/test_versions_utils.py index 04c6d78ec39d55..1d488b980b8393 100644 --- a/tests/test_versions_utils.py +++ b/tests/test_versions_utils.py @@ -14,8 +14,6 @@ import sys -import numpy - from transformers.testing_utils import TestCasePlus from transformers.utils.versions import ( importlib_metadata, @@ -25,7 +23,7 @@ ) -numpy_ver = numpy.__version__ +numpy_ver = importlib_metadata.version("numpy") python_ver = ".".join([str(x) for x in sys.version_info[:3]]) @@ -54,6 +52,9 @@ def test_core(self): # gt require_version_core("numpy>1.0.0") + # mix + require_version_core("numpy>1.0.0,<1000") + # requirement w/o version require_version_core("numpy") diff --git a/utils/check_dummies.py b/utils/check_dummies.py index 20b348cea166ac..89965f97842147 100644 --- a/utils/check_dummies.py +++ b/utils/check_dummies.py @@ -22,11 +22,11 @@ # python utils/check_dummies.py PATH_TO_TRANSFORMERS = "src/transformers" +# Matches is_xxx_available() +_re_backend = re.compile(r"is\_([a-z]*)_available()") +# Matches from xxx import bla _re_single_line_import = re.compile(r"\s+from\s+\S*\s+import\s+([^\(\s].*)\n") -_re_test_backend = re.compile(r"^\s+if\s+is\_([a-z]*)\_available\(\):\s*$") - - -BACKENDS = ["torch", "tf", "flax", "sentencepiece", "tokenizers", "vision"] +_re_test_backend = re.compile(r"^\s+if\s+is\_[a-z]*\_available\(\)") DUMMY_CONSTANT = """ @@ -36,25 +36,34 @@ DUMMY_PRETRAINED_CLASS = """ class {0}: def __init__(self, *args, **kwargs): - requires_{1}(self) + 
requires_backends(self, {1}) @classmethod def from_pretrained(self, *args, **kwargs): - requires_{1}(self) + requires_backends(self, {1}) """ DUMMY_CLASS = """ class {0}: def __init__(self, *args, **kwargs): - requires_{1}(self) + requires_backends(self, {1}) """ DUMMY_FUNCTION = """ def {0}(*args, **kwargs): - requires_{1}({0}) + requires_backends({0}, {1}) """ +def find_backend(line): + """Find one (or multiple) backend in a code line of the init.""" + if _re_test_backend.search(line) is None: + return None + backends = [b[0] for b in _re_backend.findall(line)] + backends.sort() + return "_and_".join(backends) + + def read_init(): """ Read the init and extracts PyTorch, TensorFlow, SentencePiece and Tokenizers objects. """ with open(os.path.join(PATH_TO_TRANSFORMERS, "__init__.py"), "r", encoding="utf-8", newline="\n") as f: @@ -69,14 +78,10 @@ def read_init(): # Go through the end of the file while line_index < len(lines): # If the line is an if is_backend_available, we grab all objects associated. - if _re_test_backend.search(lines[line_index]) is not None: - backend = _re_test_backend.search(lines[line_index]).groups()[0] + backend = find_backend(lines[line_index]) + if backend is not None: line_index += 1 - # Ignore if backend isn't tracked for dummies. - if backend not in BACKENDS: - continue - objects = [] # Until we unindent, add backend objects to the list while len(lines[line_index]) <= 1 or lines[line_index].startswith(" " * 8): @@ -128,13 +133,12 @@ def create_dummy_files(): """ Create the content of the dummy files. """ backend_specific_objects = read_init() # For special correspondence backend to module name as used in the function requires_modulename - module_names = {"torch": "pytorch"} dummy_files = {} for backend, objects in backend_specific_objects.items(): - backend_name = module_names.get(backend, backend) + backend_name = "[" + ", ".join(f'"{b}"' for b in backend.split("_and_")) + "]" dummy_file = "# This file is autogenerated by the command `make fix-copies`, do not edit.\n" - dummy_file += f"from ..file_utils import requires_{backend_name}\n\n" + dummy_file += "from ..file_utils import requires_backends\n\n" dummy_file += "\n".join([create_dummy_object(o, backend_name) for o in objects]) dummy_files[backend] = dummy_file @@ -156,8 +160,11 @@ def check_dummies(overwrite=False): actual_dummies = {} for backend, file_path in dummy_file_paths.items(): - with open(file_path, "r", encoding="utf-8", newline="\n") as f: - actual_dummies[backend] = f.read() + if os.path.isfile(file_path): + with open(file_path, "r", encoding="utf-8", newline="\n") as f: + actual_dummies[backend] = f.read() + else: + actual_dummies[backend] = "" for backend in dummy_files.keys(): if dummy_files[backend] != actual_dummies[backend]: diff --git a/utils/check_inits.py b/utils/check_inits.py index 7d024ed39515bc..1e4baa5feb3c6b 100644 --- a/utils/check_inits.py +++ b/utils/check_inits.py @@ -18,12 +18,14 @@ PATH_TO_TRANSFORMERS = "src/transformers" -BACKENDS = ["torch", "tf", "flax", "sentencepiece", "tokenizers", "vision"] + +# Matches is_xxx_available() +_re_backend = re.compile(r"is\_([a-z]*)_available()") # Catches a line with a key-values pattern: "bla": ["foo", "bar"] _re_import_struct_key_value = re.compile(r'\s+"\S*":\s+\[([^\]]*)\]') # Catches a line if is_foo_available -_re_test_backend = re.compile(r"^\s*if\s+is\_([a-z]*)\_available\(\):\s*$") +_re_test_backend = re.compile(r"^\s*if\s+is\_[a-z]*\_available\(\)") # Catches a line _import_struct["bla"].append("foo") 
_re_import_struct_add_one = re.compile(r'^\s*_import_structure\["\S*"\]\.append\("(\S*)"\)') # Catches a line _import_struct["bla"].extend(["foo", "bar"]) or _import_struct["bla"] = ["foo", "bar"] @@ -36,6 +38,15 @@ _re_import = re.compile(r"\s+from\s+\S*\s+import\s+([^\(\s].*)\n") +def find_backend(line): + """Find one (or multiple) backend in a code line of the init.""" + if _re_test_backend.search(line) is None: + return None + backends = [b[0] for b in _re_backend.findall(line)] + backends.sort() + return "_and_".join(backends) + + def parse_init(init_file): """ Read an init_file and parse (per backend) the _import_structure objects defined and the TYPE_CHECKING objects @@ -54,7 +65,7 @@ def parse_init(init_file): # First grab the objects without a specific backend in _import_structure objects = [] - while not lines[line_index].startswith("if TYPE_CHECKING") and _re_test_backend.search(lines[line_index]) is None: + while not lines[line_index].startswith("if TYPE_CHECKING") and find_backend(lines[line_index]) is None: line = lines[line_index] single_line_import_search = _re_import_struct_key_value.search(line) if single_line_import_search is not None: @@ -68,14 +79,10 @@ def parse_init(init_file): # Let's continue with backend-specific objects in _import_structure while not lines[line_index].startswith("if TYPE_CHECKING"): # If the line is an if is_backend_available, we grab all objects associated. - if _re_test_backend.search(lines[line_index]) is not None: - backend = _re_test_backend.search(lines[line_index]).groups()[0] + backend = find_backend(lines[line_index]) + if backend is not None: line_index += 1 - # Ignore if backend isn't tracked for dummies. - if backend not in BACKENDS: - continue - objects = [] # Until we unindent, add backend objects to the list while len(lines[line_index]) <= 1 or lines[line_index].startswith(" " * 4): @@ -106,7 +113,7 @@ def parse_init(init_file): objects = [] while ( line_index < len(lines) - and _re_test_backend.search(lines[line_index]) is None + and find_backend(lines[line_index]) is None and not lines[line_index].startswith("else") ): line = lines[line_index] @@ -121,14 +128,10 @@ def parse_init(init_file): # Let's continue with backend-specific objects while line_index < len(lines): # If the line is an if is_backemd_available, we grab all objects associated. - if _re_test_backend.search(lines[line_index]) is not None: - backend = _re_test_backend.search(lines[line_index]).groups()[0] + backend = find_backend(lines[line_index]) + if backend is not None: line_index += 1 - # Ignore if backend isn't tracked for dummies. - if backend not in BACKENDS: - continue - objects = [] # Until we unindent, add backend objects to the list while len(lines[line_index]) <= 1 or lines[line_index].startswith(" " * 8): diff --git a/utils/check_repo.py b/utils/check_repo.py index b64f5ae2c761b8..4fa45d7c663ca9 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -19,6 +19,8 @@ import re from pathlib import Path +from transformers.models.auto import get_values + # All paths are set with the intent you should run this script from the root of the repo with the command # python utils/check_repo.py @@ -45,6 +47,10 @@ "BlenderbotDecoderWrapper", # Building part of bigger (tested) model. "MBartEncoder", # Building part of bigger (tested) model. "MBartDecoderWrapper", # Building part of bigger (tested) model. + "MegatronBertLMHeadModel", # Building part of bigger (tested) model. + "MegatronBertEncoder", # Building part of bigger (tested) model. 
+ "MegatronBertDecoder", # Building part of bigger (tested) model. + "MegatronBertDecoderWrapper", # Building part of bigger (tested) model. "PegasusEncoder", # Building part of bigger (tested) model. "PegasusDecoderWrapper", # Building part of bigger (tested) model. "DPREncoder", # Building part of bigger (tested) model. @@ -79,60 +85,24 @@ # should **not** be the rule. IGNORE_NON_AUTO_CONFIGURED = [ # models to ignore for model xxx mapping - "M2M100Encoder", - "M2M100Decoder", - "Speech2TextEncoder", - "Speech2TextDecoder", - "LEDEncoder", - "LEDDecoder", - "BartDecoder", - "BartDecoderWrapper", - "BartEncoder", - "BlenderbotSmallEncoder", - "BlenderbotSmallDecoder", - "BlenderbotSmallDecoderWrapper", - "BlenderbotEncoder", - "BlenderbotDecoder", - "BlenderbotDecoderWrapper", - "DPRContextEncoder", - "DPREncoder", "DPRReader", "DPRSpanPredictor", "FlaubertForQuestionAnswering", - "FunnelBaseModel", "GPT2DoubleHeadsModel", - "MT5EncoderModel", - "MBartEncoder", - "MBartDecoder", - "MBartDecoderWrapper", "OpenAIGPTDoubleHeadsModel", - "PegasusEncoder", - "PegasusDecoder", - "PegasusDecoderWrapper", - "ProphetNetDecoder", - "ProphetNetEncoder", - "ProphetNetDecoderWrapper", "RagModel", "RagSequenceForGeneration", "RagTokenForGeneration", "T5Stack", - "T5EncoderModel", - "TFDPRContextEncoder", - "TFDPREncoder", "TFDPRReader", "TFDPRSpanPredictor", - "TFFunnelBaseModel", "TFGPT2DoubleHeadsModel", - "TFMT5EncoderModel", "TFOpenAIGPTDoubleHeadsModel", "TFRagModel", "TFRagSequenceForGeneration", "TFRagTokenForGeneration", - "TFT5EncoderModel", "Wav2Vec2ForCTC", "XLMForQuestionAnswering", - "XLMProphetNetDecoder", - "XLMProphetNetEncoder", "XLNetForQuestionAnswering", "SeparableConv1D", ] @@ -183,7 +153,7 @@ def get_model_modules(): def get_models(module): """ Get the objects in module that are models.""" models = [] - model_classes = (transformers.PreTrainedModel, transformers.TFPreTrainedModel) + model_classes = (transformers.PreTrainedModel, transformers.TFPreTrainedModel, transformers.FlaxPreTrainedModel) for attr_name in dir(module): if "Pretrained" in attr_name or "PreTrained" in attr_name: continue @@ -279,19 +249,33 @@ def get_all_auto_configured_models(): result = set() # To avoid duplicates we concatenate all model classes in a set. 
for attr_name in dir(transformers.models.auto.modeling_auto): if attr_name.startswith("MODEL_") and attr_name.endswith("MAPPING"): - result = result | set(getattr(transformers.models.auto.modeling_auto, attr_name).values()) + result = result | set(get_values(getattr(transformers.models.auto.modeling_auto, attr_name))) for attr_name in dir(transformers.models.auto.modeling_tf_auto): if attr_name.startswith("TF_MODEL_") and attr_name.endswith("MAPPING"): - result = result | set(getattr(transformers.models.auto.modeling_tf_auto, attr_name).values()) + result = result | set(get_values(getattr(transformers.models.auto.modeling_tf_auto, attr_name))) + for attr_name in dir(transformers.models.auto.modeling_flax_auto): + if attr_name.startswith("FLAX_MODEL_") and attr_name.endswith("MAPPING"): + result = result | set(get_values(getattr(transformers.models.auto.modeling_flax_auto, attr_name))) return [cls.__name__ for cls in result] +def ignore_unautoclassed(model_name): + """Rules to determine if `name` should be in an auto class.""" + # Special white list + if model_name in IGNORE_NON_AUTO_CONFIGURED: + return True + # Encoder and Decoder should be ignored + if "Encoder" in model_name or "Decoder" in model_name: + return True + return False + + def check_models_are_auto_configured(module, all_auto_models): """ Check models defined in module are each in an auto class.""" defined_models = get_models(module) failures = [] for model_name, _ in defined_models: - if model_name not in all_auto_models and model_name not in IGNORE_NON_AUTO_CONFIGURED: + if model_name not in all_auto_models and not ignore_unautoclassed(model_name): failures.append( f"{model_name} is defined in {module.__name__} but is not present in any of the auto mapping. " "If that is intended behavior, add its name to `IGNORE_NON_AUTO_CONFIGURED` in the file " @@ -414,6 +398,7 @@ def find_all_documented_objects(): "convert_tf_weight_name_to_pt_weight_name", # Internal used to convert model weights "logger", # Internal logger "logging", # External module + "requires_backends", # Internal function ] # This list should be empty. Objects in it should get their own doc page. From 8ac38f6fa70b7edceaec2c2e39aa0fb4fae79661 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Thu, 15 Apr 2021 19:00:32 -0700 Subject: [PATCH 02/20] add activation overflow debug utility --- docs/source/debugging.md | 106 ++++++++++++++ docs/source/index.rst | 3 +- src/transformers/debug_utils.py | 152 ++++++++++++++++++++ src/transformers/tokenization_utils_base.py | 2 +- src/transformers/trainer.py | 8 +- src/transformers/training_args.py | 32 ++++- 6 files changed, 295 insertions(+), 8 deletions(-) create mode 100644 docs/source/debugging.md create mode 100644 src/transformers/debug_utils.py diff --git a/docs/source/debugging.md b/docs/source/debugging.md new file mode 100644 index 00000000000000..f16e6e09570b3d --- /dev/null +++ b/docs/source/debugging.md @@ -0,0 +1,106 @@ + + +# Debugging + + +## Activations Overflow + +If you start getting `loss=NaN` or the model inhibits some other abnormal behavior due to `inf`s or `nan`s one needs to discover where the first overflow happens and what led to it. Luckily you can accomplish that easily by activating a special module that will do the detection automatically. + +If you're using the HuggingFace `Trainer`, you just need to add: + +```bash +--debug activation_overflow +``` +to the normal command line arguments, or pass `debug="activation_overflow"` when creating the `Trainer` object. 
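For example, a minimal sketch of doing this through `TrainingArguments` might look as follows (here `model` and `train_dataset` are placeholders for your own objects; only the `debug` argument is the point of the example):

```python
from transformers import Trainer, TrainingArguments

# `model` and `train_dataset` are placeholders for your own model and dataset.
args = TrainingArguments(
    output_dir="output",
    debug="activation_overflow",  # turns on the overflow detector during training
)
trainer = Trainer(model=model, args=args, train_dataset=train_dataset)
trainer.train()
```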
+ +If you're using your own trainer you can just do: + +```python +from .debug_utils import DebugActivationOverflow +debug_overflow = DebugActivationOverflow(model) +``` + +`DebugActivationOverflow` inserts hooks into the model that will test each input and output and as soon as `inf` or `nan` is detected in at least one element, the program will assert and print a report like this: + +``` +< [0] encoder.block.2.layer.1.DenseReluDense.wo: Linear: output has infs + + +last 40 frames: +abs_max= 5.96e+02 < [0] encoder.block.1.layer.1.DenseReluDense.dropout: Dropout: output +abs_max= 5.96e+02 > [0] encoder.block.1.layer.1.DenseReluDense.wo: Linear: input[0] +abs_max= 3.17e+03 < [0] encoder.block.1.layer.1.DenseReluDense.wo: Linear: output +abs_max= 2.57e+00 > [0] encoder.block.1.layer.1.DenseReluDense: T5DenseGatedGeluDense: input[0] +abs_max= 3.17e+03 < [0] encoder.block.1.layer.1.DenseReluDense: T5DenseGatedGeluDense: output +abs_max= 3.17e+03 > [0] encoder.block.1.layer.1.dropout: Dropout: input[0] +abs_max= 3.52e+03 < [0] encoder.block.1.layer.1.dropout: Dropout: output +abs_max= 1.58e+03 > [0] encoder.block.1.layer.1: T5LayerFF: input[0] +abs_max= 4.04e+03 < [0] encoder.block.1.layer.1: T5LayerFF: output +abs_max= 1.51e+03 > [0] encoder.block.1: T5Block: input[0] +abs_max= 4.04e+03 < [0] encoder.block.1: T5Block: output[0] +abs_max= 1.00e+04 < [0] encoder.block.1: T5Block: output[2] +abs_max= 4.04e+03 > [0] encoder.block.2.layer.0.layer_norm: T5LayerNorm: input[0] +abs_max= 2.69e+00 < [0] encoder.block.2.layer.0.layer_norm: T5LayerNorm: output +abs_max= 2.69e+00 > [0] encoder.block.2.layer.0.SelfAttention.q: Linear: input[0] +abs_max= 1.13e+00 < [0] encoder.block.2.layer.0.SelfAttention.q: Linear: output +abs_max= 2.69e+00 > [0] encoder.block.2.layer.0.SelfAttention.k: Linear: input[0] +abs_max= 1.69e+01 < [0] encoder.block.2.layer.0.SelfAttention.k: Linear: output +abs_max= 2.69e+00 > [0] encoder.block.2.layer.0.SelfAttention.v: Linear: input[0] +abs_max= 8.92e+00 < [0] encoder.block.2.layer.0.SelfAttention.v: Linear: output +abs_max= 7.59e+00 > [0] encoder.block.2.layer.0.SelfAttention.o: Linear: input[0] +abs_max= 2.83e+02 < [0] encoder.block.2.layer.0.SelfAttention.o: Linear: output +abs_max= 2.69e+00 > [0] encoder.block.2.layer.0.SelfAttention: T5Attention: input[0] +abs_max= 2.83e+02 < [0] encoder.block.2.layer.0.SelfAttention: T5Attention: output[0] +abs_max= 1.00e+04 < [0] encoder.block.2.layer.0.SelfAttention: T5Attention: output[2] +abs_max= 2.83e+02 > [0] encoder.block.2.layer.0.dropout: Dropout: input[0] +abs_max= 3.14e+02 < [0] encoder.block.2.layer.0.dropout: Dropout: output +abs_max= 4.04e+03 > [0] encoder.block.2.layer.0: T5LayerSelfAttention: input[0] +abs_max= 4.06e+03 < [0] encoder.block.2.layer.0: T5LayerSelfAttention: output[0] +abs_max= 1.00e+04 < [0] encoder.block.2.layer.0: T5LayerSelfAttention: output[2] +abs_max= 4.06e+03 > [0] encoder.block.2.layer.1.layer_norm: T5LayerNorm: input[0] +abs_max= 6.00e+00 < [0] encoder.block.2.layer.1.layer_norm: T5LayerNorm: output +abs_max= 6.00e+00 > [0] encoder.block.2.layer.1.DenseReluDense.wi_0: Linear: input[0] +abs_max= 5.18e+01 < [0] encoder.block.2.layer.1.DenseReluDense.wi_0: Linear: output +abs_max= 6.00e+00 > [0] encoder.block.2.layer.1.DenseReluDense.wi_1: Linear: input[0] +abs_max= 3.14e+02 < [0] encoder.block.2.layer.1.DenseReluDense.wi_1: Linear: output +abs_max= 1.62e+04 > [0] encoder.block.2.layer.1.DenseReluDense.dropout: Dropout: input[0] +abs_max= 1.80e+04 < [0] encoder.block.2.layer.1.DenseReluDense.dropout: 
Dropout: output +abs_max= 1.80e+04 > [0] encoder.block.2.layer.1.DenseReluDense.wo: Linear: input[0] +abs_max= inf < [0] encoder.block.2.layer.1.DenseReluDense.wo: Linear: output +``` + +The left column shows the value of the absolute largest element, so if you have a closer look the last few frames, the inputs and outputs were in the range of 10000. So when this training was done under mixed precision the very last step overflowed (since under `fp16` the largest number before `inf` is `64e3`). To avoid overflows under `fp16` the activations must remain way below `1e4`, because `1e4*1e4 = 1e8` so any matrix multiply with large activations is going to lead to overflow. + +The trace then prints the batch number (here `[0]` means the problem occurred on the first batch). + +Then comes the fully qualified entry from the `state_dict`, e.g.: `encoder.block.2.layer.0.layer_norm`, so you can easily see where the problem happens and what was happening just before it. + +The second to last entry show the name of the class the `forward` belongs to, and whether the report is for an input or an output and its index if either is a tuple. Only tensor variables are reported. + +Another shortcut in the first columns is`>` is for input variable, `<` is for output. + +Let's look at: + +``` +abs_max= 1.62e+04 > [0] encoder.block.2.layer.1.DenseReluDense.dropout: Dropout: input[0] +abs_max= 1.80e+04 < [0] encoder.block.2.layer.1.DenseReluDense.dropout: Dropout: output +``` + +This is a report for `Dropout.forward` function with the first entry for the only input and the second for the only output. You can see that it was called from an attribute `dropout` inside `DenseReluDense` class. We can see that it happened during the first layer, of the 2nd block, during the very first batch. Finally the absolute largest input elements was `1.62e+04` and same for the output was `1.80e+04`. + +Going back to the full report, to act on it and to fix the problem, we need to go a few frames up where the numbers started to go up and most likely switch to the `fp32` mode here, so that the numbers don't overflow when multiplied or summed up. Of course, there might be other solutions. diff --git a/docs/source/index.rst b/docs/source/index.rst index ebf09989e682e3..a980b959ade3d0 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -22,7 +22,7 @@ State-of-the-art NLP for everyone: - Hands-on practitioners - AI/ML/NLP teachers and educators -.. +.. Copyright 2020 The HuggingFace Team. All rights reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with @@ -399,6 +399,7 @@ TensorFlow and/or Flax. add_new_model fast_tokenizers testing + debugging serialization .. toctree:: diff --git a/src/transformers/debug_utils.py b/src/transformers/debug_utils.py new file mode 100644 index 00000000000000..6817efecbd809b --- /dev/null +++ b/src/transformers/debug_utils.py @@ -0,0 +1,152 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import collections + +from .utils import logging +from .file_utils import ExplicitEnum + +logger = logging.get_logger(__name__) + + +class DebugActivationOverflow(): + """ + + + """ + def __init__(self, model, max_frames_to_save=40): + self.model = model + + # keep a LIFO buffer of frames to dump as soon as inf/nan is encountered to give context to the problem emergence + self.frames = collections.deque([], max_frames_to_save) + self.save_frames = True + self.step = 0 + + self.analyse_model() + + self.register_forward_hook() + + def save_frame(self, frame): + self.frames.append(frame) + + def dump_saved_frames_once(self): + # dump the previous frames only once (to help debug) + if self.save_frames: + print(f"\n\nlast {len(self.frames)} frames:") + print("\n".join(self.frames)) + print("\n\n") + self.save_frames = False + + def analyse_model(self): + # extract the fully qualified module names, to be able to report at run time. e.g.: + # encoder.block.2.layer.0.SelfAttention.o + # + # for shared weights only the first shared module name will be registered + self.module_names = {m:name for name, m in self.model.named_modules()} + + def analyse_variable(self, var, ctx): + if torch.is_tensor(var): + if self.save_frames: + self.save_frame(get_abs_max(var, ctx)) + + if detect_overflow(var, ctx): + self.dump_saved_frames_once() + + # now we can die, as it's pointless to continue running + raise ValueError("DebugActivationOverflow: inf/nan detected, aborting as there is no point running further. " + "Please scroll up above this traceback to see the activation values prior to this event.") + + def register_forward_hook(self): + self.model.apply(self._register_forward_hook) + + def _register_forward_hook(self, module): + module.register_forward_hook(self.forward_hook) + + def forward_hook(self, module, input, output): + # - input is a tuple of packed inputs (could be non-Tensors) + # - output could be a Tensor or a tuple of Tensors and non-Tensors + + # count at which step we are (batch number) + if module == self.model: + self.step += 1 + + ctx = f"[{self.step}] {self.module_names[module]}: {module.__class__.__name__}" + + for i,x in enumerate(input): + self.analyse_variable(x, f"> {ctx}: input[{i}]") + + if isinstance(output, tuple): + for i,x in enumerate(output): + # possibly a tuple of tuples + if isinstance(x, tuple): + for j,y in enumerate(x): + self.analyse_variable(y, f"< {ctx}: output[{i}][{j}]") + else: + self.analyse_variable(x, f"< {ctx}: output[{i}]") + else: + self.analyse_variable(output, f"< {ctx}: output") + +def get_abs_max(var, ctx): + abs_max = max(abs(var.min()), abs(var.max())) + return f"abs_max={abs_max:9.2e} {ctx}" + +def get_min_max(var, ctx): + return f"min={var.min():9.2e} max={var.max():9.2e} {ctx}" + +def detect_overflow(var, ctx): + """ + Report the count of ``nan`` and ``inf`` entries in the tensor. + + This is useful for detecting overflows/underflows and best to call right after the function that did some math that + modified the variable in question. 
+ + Args: + var: tensor variable to check + ctx: the message to print as a context + + Return: + True if inf or nan was detected, False otherwise + """ + detected = False + if torch.isnan(var).any().item(): + detected = True + print(f"{ctx} has nans") + if torch.isinf(var).any().item(): + detected = True + print(f"{ctx} has infs") + + # if needed to monitor large elements can enable the following + if 0: # and detected: + n100 = var[torch.ge(var.abs(), 100)] + if n100.numel() > 0: + print(f"{ctx}: n100={n100.numel()}") + n1000 = var[torch.ge(var.abs(), 1000)] + if n1000.numel() > 0: + print(f"{ctx}: n1000={n1000.numel()}") + n10000 = var[torch.ge(var.abs(), 10000)] + if n10000.numel() > 0: + print(f"{ctx}: n10000={n10000.numel()}") + + if 0: +# print(f" min={var.min():9.2e} max={var.max():9.2e} var={var.var():9.2e} mean={var.mean():9.2e} ({ctx})") + print(f" min={var.min():9.2e} max={var.max():9.2e}") + + return detected + + + +class DebugOption(ExplicitEnum): + ACIVATION_OVERFLOW = "activation_overflow" + TPU_METRICS_DEBUG = "tpu_metrics_debug" diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 7b68164b914467..9764a1c8d59816 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -3128,7 +3128,7 @@ def clean_up_tokenization(out_string: str) -> str: def _eventual_warn_about_too_long_sequence(self, ids: List[int], max_length: Optional[int], verbose: bool): """ - Depending on the input and internal state we might trigger a warning about a sequence that is too long for it's + Depending on the input and internal state we might trigger a warning about a sequence that is too long for its corresponding model Args: diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 41800b7fd3a32c..72fecb74b74641 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -54,6 +54,7 @@ from torch.utils.data.sampler import RandomSampler, SequentialSampler from .data.data_collator import DataCollator, DataCollatorWithPadding, default_data_collator +from .debug_utils import DebugActivationOverflow, DebugOption from .dependency_versions_check import dep_version_check from .file_utils import ( WEIGHTS_NAME, @@ -961,6 +962,9 @@ def train( num_train_epochs = 1 num_update_steps_per_epoch = max_steps + if DebugOption.ACIVATION_OVERFLOW in self.args.debug: + debug_overflow = DebugActivationOverflow(self.model) # noqa + delay_optimizer_creation = self.sharded_ddp is not None and self.sharded_ddp != ShardedDDPOption.SIMPLE if self.args.deepspeed: deepspeed_engine, optimizer, lr_scheduler = deepspeed_init( @@ -1179,7 +1183,7 @@ def train( self.control = self.callback_handler.on_epoch_end(self.args, self.state, self.control) self._maybe_log_save_evaluate(tr_loss, model, trial, epoch) - if self.args.tpu_metrics_debug or self.args.debug: + if DebugOption.TPU_METRICS_DEBUG in self.args.debug: if is_torch_tpu_available(): # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) xm.master_print(met.metrics_report()) @@ -1787,7 +1791,7 @@ def evaluate( output.metrics.update(speed_metrics(metric_key_prefix, start_time, n_samples)) self.log(output.metrics) - if self.args.tpu_metrics_debug or self.args.debug: + if DebugOption.TPU_METRICS_DEBUG in self.args.debug: # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) 
xm.master_print(met.metrics_report()) diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 188bf92b63df05..9ca78abf2198dd 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -19,6 +19,7 @@ from enum import Enum from typing import Any, Dict, List, Optional +from .debug_utils import DebugOption from .file_utils import ( cached_property, is_sagemaker_dp_enabled, @@ -194,8 +195,6 @@ class TrainingArguments: Rank of the process during distributed training. tpu_num_cores (:obj:`int`, `optional`): When training on TPU, the number of TPU cores (automatically passed by launcher script). - debug (:obj:`bool`, `optional`, defaults to :obj:`False`): - When training on TPU, whether to print debug metrics or not. dataloader_drop_last (:obj:`bool`, `optional`, defaults to :obj:`False`): Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size) or not. @@ -277,6 +276,16 @@ class TrainingArguments: The label smoothing factor to use. Zero means no label smoothing, otherwise the underlying onehot-encoded labels are changed from 0s and 1s to :obj:`label_smoothing_factor/num_labels` and :obj:`1 - label_smoothing_factor + label_smoothing_factor/num_labels` respectively. + debug (:obj:`str` or list of :class:`~transformers.debug_utils.DebugOption`, `optional`, defaults to :obj:``): + Enable one or more debug features. This is an experimental feature. + + Possible options are: + + - :obj:`"activation_overflow"`: detects overflow in model's input/outputs and reports the last frames that + led to the event + - :obj:`"tpu_metrics_debug"`: print debug metrics on TPU + + The options should be separated by whitespaces. adafactor (:obj:`bool`, `optional`, defaults to :obj:`False`): Whether or not to use the :class:`~transformers.Adafactor` optimizer instead of :class:`~transformers.AdamW`. @@ -431,9 +440,18 @@ class TrainingArguments: ) tpu_metrics_debug: bool = field( default=False, - metadata={"help": "Deprecated, the use of `--debug` is preferred. TPU: Whether to print debug metrics"}, + metadata={ + "help": "Deprecated, the use of `--debug tpu_metrics_debug` is preferred. TPU: Whether to print debug metrics" + }, + ) + debug: str = field( + default="", + metadata={ + "help": "Whether or not to enable debug mode. Current options: " + "`activation_overflow` (Detect overflow in activations), " + "`tpu_metrics_debug` (print debug metrics on TPU)." + }, ) - debug: bool = field(default=False, metadata={"help": "Whether to print debug metrics on TPU"}) dataloader_drop_last: bool = field( default=False, metadata={"help": "Drop the last incomplete batch if it is not divisible by the batch size."} @@ -615,6 +633,12 @@ def __post_init__(self): elif ShardedDDPOption.ZERO_DP_2 in self.sharded_ddp and ShardedDDPOption.ZERO_DP_3 in self.sharded_ddp: raise ValueError("`--sharded_ddp zero_dp_2` is not compatible with `--sharded_ddp zero_dp_3`.") + if self.tpu_metrics_debug: + self.debug += " tpu_metrics_debug" + self.tpu_metrics_debug = False + if isinstance(self.debug, str): + self.debug = [DebugOption(s) for s in self.debug.split()] + def __repr__(self): # We override the default repr to remove deprecated arguments from the repr. This method should be removed once # those deprecated arguments are removed form TrainingArguments. 
(TODO: v5) From 9c2137e6b2a24b908185e9a0fe22191af5ecc0d5 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Thu, 15 Apr 2021 19:24:24 -0700 Subject: [PATCH 03/20] cleanup --- src/transformers/debug_utils.py | 54 ++++++++++++++----- src/transformers/dependency_versions_table.py | 2 +- src/transformers/training_args.py | 2 +- 3 files changed, 43 insertions(+), 15 deletions(-) diff --git a/src/transformers/debug_utils.py b/src/transformers/debug_utils.py index 6817efecbd809b..11c8c3d8c024b0 100644 --- a/src/transformers/debug_utils.py +++ b/src/transformers/debug_utils.py @@ -12,20 +12,42 @@ # See the License for the specific language governing permissions and # limitations under the License. -import torch import collections -from .utils import logging +import torch + from .file_utils import ExplicitEnum +from .utils import logging + logger = logging.get_logger(__name__) -class DebugActivationOverflow(): +class DebugActivationOverflow: """ + This debug class helps detect and understand where the model starts getting ``nan``s or ``inf``s in activation + elements. + + To activate, initialize the object with the model :: + + debug_overflow = DebugActivationOverflow(model) + + then run the training as normal and if any ``nan`` or ``inf`` get detected this module will throw an exception and + will print several dozens of frames that lead to this event, each line reporting: + 1. the absolute largest element of either input or output variable + 2. the batch number + 3. the fully qualified state_dict key of which element it was run for, + 4. the class name whose ``forward`` was run + 5. and finally whether it was an input or output and its index if it was a tuple. + Args: + model (:obj:`nn.Module`): + the model that fails to train due to ``nan``s or ``inf``s + max_frames_to_save (:obj:`int`, `optional`): + how many variables and their frames to record back - a few dozens is a good number, defaults to 40): """ + def __init__(self, model, max_frames_to_save=40): self.model = model @@ -54,7 +76,7 @@ def analyse_model(self): # encoder.block.2.layer.0.SelfAttention.o # # for shared weights only the first shared module name will be registered - self.module_names = {m:name for name, m in self.model.named_modules()} + self.module_names = {m: name for name, m in self.model.named_modules()} def analyse_variable(self, var, ctx): if torch.is_tensor(var): @@ -65,8 +87,10 @@ def analyse_variable(self, var, ctx): self.dump_saved_frames_once() # now we can die, as it's pointless to continue running - raise ValueError("DebugActivationOverflow: inf/nan detected, aborting as there is no point running further. " - "Please scroll up above this traceback to see the activation values prior to this event.") + raise ValueError( + "DebugActivationOverflow: inf/nan detected, aborting as there is no point running further. " + "Please scroll up above this traceback to see the activation values prior to this event." 
+ ) def register_forward_hook(self): self.model.apply(self._register_forward_hook) @@ -84,27 +108,30 @@ def forward_hook(self, module, input, output): ctx = f"[{self.step}] {self.module_names[module]}: {module.__class__.__name__}" - for i,x in enumerate(input): + for i, x in enumerate(input): self.analyse_variable(x, f"> {ctx}: input[{i}]") if isinstance(output, tuple): - for i,x in enumerate(output): + for i, x in enumerate(output): # possibly a tuple of tuples if isinstance(x, tuple): - for j,y in enumerate(x): + for j, y in enumerate(x): self.analyse_variable(y, f"< {ctx}: output[{i}][{j}]") else: self.analyse_variable(x, f"< {ctx}: output[{i}]") else: self.analyse_variable(output, f"< {ctx}: output") + def get_abs_max(var, ctx): abs_max = max(abs(var.min()), abs(var.max())) return f"abs_max={abs_max:9.2e} {ctx}" + def get_min_max(var, ctx): return f"min={var.min():9.2e} max={var.max():9.2e} {ctx}" + def detect_overflow(var, ctx): """ Report the count of ``nan`` and ``inf`` entries in the tensor. @@ -128,7 +155,7 @@ def detect_overflow(var, ctx): print(f"{ctx} has infs") # if needed to monitor large elements can enable the following - if 0: # and detected: + if 0: # and detected: n100 = var[torch.ge(var.abs(), 100)] if n100.numel() > 0: print(f"{ctx}: n100={n100.numel()}") @@ -140,11 +167,12 @@ def detect_overflow(var, ctx): print(f"{ctx}: n10000={n10000.numel()}") if 0: -# print(f" min={var.min():9.2e} max={var.max():9.2e} var={var.var():9.2e} mean={var.mean():9.2e} ({ctx})") - print(f" min={var.min():9.2e} max={var.max():9.2e}") + print(f"min={var.min():9.2e} max={var.max():9.2e}") - return detected + if 0: + print(f"min={var.min():9.2e} max={var.max():9.2e} var={var.var():9.2e} mean={var.mean():9.2e} ({ctx})") + return detected class DebugOption(ExplicitEnum): diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 82968ff299491a..cfd1a6c86d054d 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -7,7 +7,7 @@ "cookiecutter": "cookiecutter==1.7.2", "dataclasses": "dataclasses", "datasets": "datasets", - "deepspeed": "deepspeed>0.3.13", + "deepspeed": "deepspeed>=0.3.14", "docutils": "docutils==0.16.0", "fairscale": "fairscale>0.3", "faiss-cpu": "faiss-cpu", diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 251371fe375c27..74141e57ff456b 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -276,7 +276,7 @@ class TrainingArguments: The label smoothing factor to use. Zero means no label smoothing, otherwise the underlying onehot-encoded labels are changed from 0s and 1s to :obj:`label_smoothing_factor/num_labels` and :obj:`1 - label_smoothing_factor + label_smoothing_factor/num_labels` respectively. - debug (:obj:`str` or list of :class:`~transformers.debug_utils.DebugOption`, `optional`, defaults to :obj:``): + debug (:obj:`str` or list of :class:`~transformers.debug_utils.DebugOption`, `optional`, defaults to :obj:`""`): Enable one or more debug features. This is an experimental feature. 
Possible options are: From 575aeb659292c8979eca7f6705b13df0cda9b301 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Thu, 15 Apr 2021 19:35:52 -0700 Subject: [PATCH 04/20] document detect_overflow --- docs/source/debugging.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/docs/source/debugging.md b/docs/source/debugging.md index f16e6e09570b3d..9dd4c7e8ca8216 100644 --- a/docs/source/debugging.md +++ b/docs/source/debugging.md @@ -104,3 +104,20 @@ abs_max= 1.80e+04 < [0] encoder.block.2.layer.1.DenseReluDense.dropout: Dropout: This is a report for `Dropout.forward` function with the first entry for the only input and the second for the only output. You can see that it was called from an attribute `dropout` inside `DenseReluDense` class. We can see that it happened during the first layer, of the 2nd block, during the very first batch. Finally the absolute largest input elements was `1.62e+04` and same for the output was `1.80e+04`. Going back to the full report, to act on it and to fix the problem, we need to go a few frames up where the numbers started to go up and most likely switch to the `fp32` mode here, so that the numbers don't overflow when multiplied or summed up. Of course, there might be other solutions. + +Since the automatic detector only reports inputs and outputs, once you know where to look, you may want to analyse the intermediary stages of `forward` as well. In such a case you can use the helper function to inject the detector where you want it, for example: + +``` +from debug_utils import detect_overflow + +class T5LayerFF(nn.Module): + [...] + def forward(self, hidden_states): + forwarded_states = self.layer_norm(hidden_states) + detect_overflow(forwarded_states, "after layer_norm") + forwarded_states = self.DenseReluDense(forwarded_states) + detect_overflow(forwarded_states, "after DenseReluDense") + return hidden_states + self.dropout(forwarded_states) +``` + +You can see that we added 2 of these and now we can know the absolute largest numbers for `forwarded_states` at 2 different stages. From 39815b92d448d5a4f3b7c58f17b57723282575b0 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Thu, 15 Apr 2021 19:39:22 -0700 Subject: [PATCH 05/20] import torch --- src/transformers/debug_utils.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/transformers/debug_utils.py b/src/transformers/debug_utils.py index 11c8c3d8c024b0..e358de00278445 100644 --- a/src/transformers/debug_utils.py +++ b/src/transformers/debug_utils.py @@ -14,12 +14,14 @@ import collections -import torch - -from .file_utils import ExplicitEnum +from .file_utils import ExplicitEnum, is_torch_available from .utils import logging +if is_torch_available(): + import torch + + logger = logging.get_logger(__name__) From bb66de7098467f5c49bb90994d8cd4db1539293b Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 16 Apr 2021 09:54:00 -0700 Subject: [PATCH 06/20] add deprecation warning --- src/transformers/training_args.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 74141e57ff456b..94e5c5d87ef925 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -634,6 +634,10 @@ def __post_init__(self): raise ValueError("`--sharded_ddp zero_dp_2` is not compatible with `--sharded_ddp zero_dp_3`.") if self.tpu_metrics_debug: + warnings.warn( + "using `--tpu_metrics_debug` is deprecated and will be removed in version 5 of 🤗 Transformers. 
Use `--debug tpu_metrics_debug` instead", + FutureWarning, + ) self.debug += " tpu_metrics_debug" self.tpu_metrics_debug = False if isinstance(self.debug, str): From 58ee50ef2f8bb518c4d71c6fe3c389663e2774ff Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 16 Apr 2021 09:55:41 -0700 Subject: [PATCH 07/20] Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- docs/source/debugging.md | 2 +- src/transformers/debug_utils.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/debugging.md b/docs/source/debugging.md index 9dd4c7e8ca8216..51982b8d98e723 100644 --- a/docs/source/debugging.md +++ b/docs/source/debugging.md @@ -92,7 +92,7 @@ Then comes the fully qualified entry from the `state_dict`, e.g.: `encoder.block The second to last entry show the name of the class the `forward` belongs to, and whether the report is for an input or an output and its index if either is a tuple. Only tensor variables are reported. -Another shortcut in the first columns is`>` is for input variable, `<` is for output. +Another shortcut in the first columns `>` is for input variable, `<` is for output. Let's look at: diff --git a/src/transformers/debug_utils.py b/src/transformers/debug_utils.py index e358de00278445..c247107544c0e0 100644 --- a/src/transformers/debug_utils.py +++ b/src/transformers/debug_utils.py @@ -45,9 +45,9 @@ class DebugActivationOverflow: Args: model (:obj:`nn.Module`): - the model that fails to train due to ``nan``s or ``inf``s - max_frames_to_save (:obj:`int`, `optional`): - how many variables and their frames to record back - a few dozens is a good number, defaults to 40): + The model to debug. + max_frames_to_save (:obj:`int`, `optional`, defaults to 40): + How many variables and their frames to record back - a few dozens is a good number. """ def __init__(self, model, max_frames_to_save=40): From f3d1145b6aacde476470bd620aba94316632cc5d Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 16 Apr 2021 10:21:13 -0700 Subject: [PATCH 08/20] convert to rst, add note --- docs/source/debugging.md | 123 ------------------------------------ docs/source/debugging.rst | 127 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+), 123 deletions(-) delete mode 100644 docs/source/debugging.md create mode 100644 docs/source/debugging.rst diff --git a/docs/source/debugging.md b/docs/source/debugging.md deleted file mode 100644 index 51982b8d98e723..00000000000000 --- a/docs/source/debugging.md +++ /dev/null @@ -1,123 +0,0 @@ - - -# Debugging - - -## Activations Overflow - -If you start getting `loss=NaN` or the model inhibits some other abnormal behavior due to `inf`s or `nan`s one needs to discover where the first overflow happens and what led to it. Luckily you can accomplish that easily by activating a special module that will do the detection automatically. - -If you're using the HuggingFace `Trainer`, you just need to add: - -```bash ---debug activation_overflow -``` -to the normal command line arguments, or pass `debug="activation_overflow"` when creating the `Trainer` object. 
- -If you're using your own trainer you can just do: - -```python -from .debug_utils import DebugActivationOverflow -debug_overflow = DebugActivationOverflow(model) -``` - -`DebugActivationOverflow` inserts hooks into the model that will test each input and output and as soon as `inf` or `nan` is detected in at least one element, the program will assert and print a report like this: - -``` -< [0] encoder.block.2.layer.1.DenseReluDense.wo: Linear: output has infs - - -last 40 frames: -abs_max= 5.96e+02 < [0] encoder.block.1.layer.1.DenseReluDense.dropout: Dropout: output -abs_max= 5.96e+02 > [0] encoder.block.1.layer.1.DenseReluDense.wo: Linear: input[0] -abs_max= 3.17e+03 < [0] encoder.block.1.layer.1.DenseReluDense.wo: Linear: output -abs_max= 2.57e+00 > [0] encoder.block.1.layer.1.DenseReluDense: T5DenseGatedGeluDense: input[0] -abs_max= 3.17e+03 < [0] encoder.block.1.layer.1.DenseReluDense: T5DenseGatedGeluDense: output -abs_max= 3.17e+03 > [0] encoder.block.1.layer.1.dropout: Dropout: input[0] -abs_max= 3.52e+03 < [0] encoder.block.1.layer.1.dropout: Dropout: output -abs_max= 1.58e+03 > [0] encoder.block.1.layer.1: T5LayerFF: input[0] -abs_max= 4.04e+03 < [0] encoder.block.1.layer.1: T5LayerFF: output -abs_max= 1.51e+03 > [0] encoder.block.1: T5Block: input[0] -abs_max= 4.04e+03 < [0] encoder.block.1: T5Block: output[0] -abs_max= 1.00e+04 < [0] encoder.block.1: T5Block: output[2] -abs_max= 4.04e+03 > [0] encoder.block.2.layer.0.layer_norm: T5LayerNorm: input[0] -abs_max= 2.69e+00 < [0] encoder.block.2.layer.0.layer_norm: T5LayerNorm: output -abs_max= 2.69e+00 > [0] encoder.block.2.layer.0.SelfAttention.q: Linear: input[0] -abs_max= 1.13e+00 < [0] encoder.block.2.layer.0.SelfAttention.q: Linear: output -abs_max= 2.69e+00 > [0] encoder.block.2.layer.0.SelfAttention.k: Linear: input[0] -abs_max= 1.69e+01 < [0] encoder.block.2.layer.0.SelfAttention.k: Linear: output -abs_max= 2.69e+00 > [0] encoder.block.2.layer.0.SelfAttention.v: Linear: input[0] -abs_max= 8.92e+00 < [0] encoder.block.2.layer.0.SelfAttention.v: Linear: output -abs_max= 7.59e+00 > [0] encoder.block.2.layer.0.SelfAttention.o: Linear: input[0] -abs_max= 2.83e+02 < [0] encoder.block.2.layer.0.SelfAttention.o: Linear: output -abs_max= 2.69e+00 > [0] encoder.block.2.layer.0.SelfAttention: T5Attention: input[0] -abs_max= 2.83e+02 < [0] encoder.block.2.layer.0.SelfAttention: T5Attention: output[0] -abs_max= 1.00e+04 < [0] encoder.block.2.layer.0.SelfAttention: T5Attention: output[2] -abs_max= 2.83e+02 > [0] encoder.block.2.layer.0.dropout: Dropout: input[0] -abs_max= 3.14e+02 < [0] encoder.block.2.layer.0.dropout: Dropout: output -abs_max= 4.04e+03 > [0] encoder.block.2.layer.0: T5LayerSelfAttention: input[0] -abs_max= 4.06e+03 < [0] encoder.block.2.layer.0: T5LayerSelfAttention: output[0] -abs_max= 1.00e+04 < [0] encoder.block.2.layer.0: T5LayerSelfAttention: output[2] -abs_max= 4.06e+03 > [0] encoder.block.2.layer.1.layer_norm: T5LayerNorm: input[0] -abs_max= 6.00e+00 < [0] encoder.block.2.layer.1.layer_norm: T5LayerNorm: output -abs_max= 6.00e+00 > [0] encoder.block.2.layer.1.DenseReluDense.wi_0: Linear: input[0] -abs_max= 5.18e+01 < [0] encoder.block.2.layer.1.DenseReluDense.wi_0: Linear: output -abs_max= 6.00e+00 > [0] encoder.block.2.layer.1.DenseReluDense.wi_1: Linear: input[0] -abs_max= 3.14e+02 < [0] encoder.block.2.layer.1.DenseReluDense.wi_1: Linear: output -abs_max= 1.62e+04 > [0] encoder.block.2.layer.1.DenseReluDense.dropout: Dropout: input[0] -abs_max= 1.80e+04 < [0] encoder.block.2.layer.1.DenseReluDense.dropout: 
Dropout: output -abs_max= 1.80e+04 > [0] encoder.block.2.layer.1.DenseReluDense.wo: Linear: input[0] -abs_max= inf < [0] encoder.block.2.layer.1.DenseReluDense.wo: Linear: output -``` - -The left column shows the value of the absolute largest element, so if you have a closer look the last few frames, the inputs and outputs were in the range of 10000. So when this training was done under mixed precision the very last step overflowed (since under `fp16` the largest number before `inf` is `64e3`). To avoid overflows under `fp16` the activations must remain way below `1e4`, because `1e4*1e4 = 1e8` so any matrix multiply with large activations is going to lead to overflow. - -The trace then prints the batch number (here `[0]` means the problem occurred on the first batch). - -Then comes the fully qualified entry from the `state_dict`, e.g.: `encoder.block.2.layer.0.layer_norm`, so you can easily see where the problem happens and what was happening just before it. - -The second to last entry show the name of the class the `forward` belongs to, and whether the report is for an input or an output and its index if either is a tuple. Only tensor variables are reported. - -Another shortcut in the first columns `>` is for input variable, `<` is for output. - -Let's look at: - -``` -abs_max= 1.62e+04 > [0] encoder.block.2.layer.1.DenseReluDense.dropout: Dropout: input[0] -abs_max= 1.80e+04 < [0] encoder.block.2.layer.1.DenseReluDense.dropout: Dropout: output -``` - -This is a report for `Dropout.forward` function with the first entry for the only input and the second for the only output. You can see that it was called from an attribute `dropout` inside `DenseReluDense` class. We can see that it happened during the first layer, of the 2nd block, during the very first batch. Finally the absolute largest input elements was `1.62e+04` and same for the output was `1.80e+04`. - -Going back to the full report, to act on it and to fix the problem, we need to go a few frames up where the numbers started to go up and most likely switch to the `fp32` mode here, so that the numbers don't overflow when multiplied or summed up. Of course, there might be other solutions. - -Since the automatic detector only reports inputs and outputs, once you know where to look, you may want to analyse the intermediary stages of `forward` as well. In such a case you can use the helper function to inject the detector where you want it, for example: - -``` -from debug_utils import detect_overflow - -class T5LayerFF(nn.Module): - [...] - def forward(self, hidden_states): - forwarded_states = self.layer_norm(hidden_states) - detect_overflow(forwarded_states, "after layer_norm") - forwarded_states = self.DenseReluDense(forwarded_states) - detect_overflow(forwarded_states, "after DenseReluDense") - return hidden_states + self.dropout(forwarded_states) -``` - -You can see that we added 2 of these and now we can know the absolute largest numbers for `forwarded_states` at 2 different stages. diff --git a/docs/source/debugging.rst b/docs/source/debugging.rst new file mode 100644 index 00000000000000..2a8f1451a96645 --- /dev/null +++ b/docs/source/debugging.rst @@ -0,0 +1,127 @@ +.. + Copyright 2021 The HuggingFace Team. All rights reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + specific language governing permissions and limitations under the License. + + + +Debugging +========= + +Activations Overflow +-------------------- + +.. note:: + + This feature is currently available for PyTorch-only. + +If you start getting ``loss=NaN`` or the model inhibits some other abnormal behavior due to ``inf``s or ``nan``s one needs to discover where the first overflow happens and what led to it. Luckily you can accomplish that easily by activating a special module that will do the detection automatically. + +If you're using :class:`~transformers.Trainer`, you just need to add: + +.. code-block:: bash + + --debug activation_overflow + +to the normal command line arguments, or pass ``debug="activation_overflow"`` when creating the :class:`~transformers.Trainer` object. + +If you're using your own trainer you can just do: + +.. code-block:: python + + from .debug_utils import DebugActivationOverflow + debug_overflow = DebugActivationOverflow(model) + +``DebugActivationOverflow`` inserts hooks into the model that will test each input and output and as soon as ``inf`` or ``nan`` is detected in at least one element, the program will assert and print a report like this: + +.. code-block:: + + < [0] encoder.block.2.layer.1.DenseReluDense.wo: Linear: output has infs + + + last 40 frames: + abs_max= 5.96e+02 < [0] encoder.block.1.layer.1.DenseReluDense.dropout: Dropout: output + abs_max= 5.96e+02 > [0] encoder.block.1.layer.1.DenseReluDense.wo: Linear: input[0] + abs_max= 3.17e+03 < [0] encoder.block.1.layer.1.DenseReluDense.wo: Linear: output + abs_max= 2.57e+00 > [0] encoder.block.1.layer.1.DenseReluDense: T5DenseGatedGeluDense: input[0] + abs_max= 3.17e+03 < [0] encoder.block.1.layer.1.DenseReluDense: T5DenseGatedGeluDense: output + abs_max= 3.17e+03 > [0] encoder.block.1.layer.1.dropout: Dropout: input[0] + abs_max= 3.52e+03 < [0] encoder.block.1.layer.1.dropout: Dropout: output + abs_max= 1.58e+03 > [0] encoder.block.1.layer.1: T5LayerFF: input[0] + abs_max= 4.04e+03 < [0] encoder.block.1.layer.1: T5LayerFF: output + abs_max= 1.51e+03 > [0] encoder.block.1: T5Block: input[0] + abs_max= 4.04e+03 < [0] encoder.block.1: T5Block: output[0] + abs_max= 1.00e+04 < [0] encoder.block.1: T5Block: output[2] + abs_max= 4.04e+03 > [0] encoder.block.2.layer.0.layer_norm: T5LayerNorm: input[0] + abs_max= 2.69e+00 < [0] encoder.block.2.layer.0.layer_norm: T5LayerNorm: output + abs_max= 2.69e+00 > [0] encoder.block.2.layer.0.SelfAttention.q: Linear: input[0] + abs_max= 1.13e+00 < [0] encoder.block.2.layer.0.SelfAttention.q: Linear: output + abs_max= 2.69e+00 > [0] encoder.block.2.layer.0.SelfAttention.k: Linear: input[0] + abs_max= 1.69e+01 < [0] encoder.block.2.layer.0.SelfAttention.k: Linear: output + abs_max= 2.69e+00 > [0] encoder.block.2.layer.0.SelfAttention.v: Linear: input[0] + abs_max= 8.92e+00 < [0] encoder.block.2.layer.0.SelfAttention.v: Linear: output + abs_max= 7.59e+00 > [0] encoder.block.2.layer.0.SelfAttention.o: Linear: input[0] + abs_max= 2.83e+02 < [0] encoder.block.2.layer.0.SelfAttention.o: Linear: output + abs_max= 2.69e+00 > [0] encoder.block.2.layer.0.SelfAttention: T5Attention: input[0] + abs_max= 2.83e+02 < [0] encoder.block.2.layer.0.SelfAttention: 
T5Attention: output[0] + abs_max= 1.00e+04 < [0] encoder.block.2.layer.0.SelfAttention: T5Attention: output[2] + abs_max= 2.83e+02 > [0] encoder.block.2.layer.0.dropout: Dropout: input[0] + abs_max= 3.14e+02 < [0] encoder.block.2.layer.0.dropout: Dropout: output + abs_max= 4.04e+03 > [0] encoder.block.2.layer.0: T5LayerSelfAttention: input[0] + abs_max= 4.06e+03 < [0] encoder.block.2.layer.0: T5LayerSelfAttention: output[0] + abs_max= 1.00e+04 < [0] encoder.block.2.layer.0: T5LayerSelfAttention: output[2] + abs_max= 4.06e+03 > [0] encoder.block.2.layer.1.layer_norm: T5LayerNorm: input[0] + abs_max= 6.00e+00 < [0] encoder.block.2.layer.1.layer_norm: T5LayerNorm: output + abs_max= 6.00e+00 > [0] encoder.block.2.layer.1.DenseReluDense.wi_0: Linear: input[0] + abs_max= 5.18e+01 < [0] encoder.block.2.layer.1.DenseReluDense.wi_0: Linear: output + abs_max= 6.00e+00 > [0] encoder.block.2.layer.1.DenseReluDense.wi_1: Linear: input[0] + abs_max= 3.14e+02 < [0] encoder.block.2.layer.1.DenseReluDense.wi_1: Linear: output + abs_max= 1.62e+04 > [0] encoder.block.2.layer.1.DenseReluDense.dropout: Dropout: input[0] + abs_max= 1.80e+04 < [0] encoder.block.2.layer.1.DenseReluDense.dropout: Dropout: output + abs_max= 1.80e+04 > [0] encoder.block.2.layer.1.DenseReluDense.wo: Linear: input[0] + abs_max= inf < [0] encoder.block.2.layer.1.DenseReluDense.wo: Linear: output + +The left column shows the value of the absolute largest element, so if you have a closer look the last few frames, the inputs and outputs were in the range of 10000. So when this training was done under mixed precision the very last step overflowed (since under ``fp16`` the largest number before ``inf`` is ``64e3``). To avoid overflows under ``fp16`` the activations must remain way below ``1e4``, because ``1e4*1e4 = 1e8`` so any matrix multiply with large activations is going to lead to overflow. + +The trace then prints the batch number (here ``[0]`` means the problem occurred on the first batch). + +Then comes the fully qualified entry from the ``state_dict``, e.g.: ``encoder.block.2.layer.0.layer_norm``, so you can easily see where the problem happens and what was happening just before it. + +The second to last entry show the name of the class the ``forward`` belongs to, and whether the report is for an input or an output and its index if either is a tuple. Only tensor variables are reported. + +Another shortcut in the first columns ``>`` is for input variable, ``<`` is for output. + +Let's look at: + +.. code-block:: + + abs_max= 1.62e+04 > [0] encoder.block.2.layer.1.DenseReluDense.dropout: Dropout: input[0] + abs_max= 1.80e+04 < [0] encoder.block.2.layer.1.DenseReluDense.dropout: Dropout: output + +This is a report for ``Dropout.forward`` function with the first entry for the only input and the second for the only output. You can see that it was called from an attribute ``dropout`` inside ``DenseReluDense`` class. We can see that it happened during the first layer, of the 2nd block, during the very first batch. Finally the absolute largest input elements was ``1.62e+04`` and same for the output was ``1.80e+04``. + +Going back to the full report, to act on it and to fix the problem, we need to go a few frames up where the numbers started to go up and most likely switch to the ``fp32`` mode here, so that the numbers don't overflow when multiplied or summed up. Of course, there might be other solutions. 
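One possible way to do that, sketched here as a hypothetical helper (the name ``matmul_in_fp32`` and its placement are illustrative, not part of the library, and the sketch assumes native AMP via ``torch.cuda.amp``, where the model weights themselves stay in ``fp32``), is to disable autocast around the overflow-prone projection and cast back afterwards:

.. code-block:: python

    import torch

    def matmul_in_fp32(linear, hidden_states):
        # Run a single overflow-prone projection in full precision, then cast the
        # result back to the original (possibly fp16) dtype of the activations.
        with torch.cuda.amp.autocast(enabled=False):
            output = linear(hidden_states.to(torch.float32))
        return output.to(hidden_states.dtype)

You would then call such a helper in place of the overflowing ``self.wo(hidden_states)`` projection identified by the report above.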
+ +Since the automatic detector only reports inputs and outputs, once you know where to look, you may want to analyse the intermediary stages of ``forward`` as well. In such a case you can use the helper function to inject the detector where you want it, for example: + +.. code-block:: + + from debug_utils import detect_overflow + + class T5LayerFF(nn.Module): + [...] + def forward(self, hidden_states): + forwarded_states = self.layer_norm(hidden_states) + detect_overflow(forwarded_states, "after layer_norm") + forwarded_states = self.DenseReluDense(forwarded_states) + detect_overflow(forwarded_states, "after DenseReluDense") + return hidden_states + self.dropout(forwarded_states) + +You can see that we added 2 of these and now we can know the absolute largest numbers for ``forwarded_states`` at 2 different stages. From ad88c61b652a41850201871a1b343e9b4e9d7a6c Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 16 Apr 2021 10:24:53 -0700 Subject: [PATCH 09/20] add class --- docs/source/internal/trainer_utils.rst | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/source/internal/trainer_utils.rst b/docs/source/internal/trainer_utils.rst index c649eb3ab4e4ff..9229ba595e1103 100644 --- a/docs/source/internal/trainer_utils.rst +++ b/docs/source/internal/trainer_utils.rst @@ -1,4 +1,4 @@ -.. +.. Copyright 2020 The HuggingFace Team. All rights reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with @@ -46,3 +46,9 @@ Distributed Evaluation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.HfArgumentParser + + +Debug Utilities +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.debug_utils.DebugActivationOverflow From 73379222a8e8ccc192e5e7c38dcd91795d79197e Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 16 Apr 2021 10:29:06 -0700 Subject: [PATCH 10/20] fix docs --- docs/source/debugging.rst | 160 ++++++++++++++++++-------------- src/transformers/debug_utils.py | 2 +- 2 files changed, 90 insertions(+), 72 deletions(-) diff --git a/docs/source/debugging.rst b/docs/source/debugging.rst index 2a8f1451a96645..3830ebc1a20655 100644 --- a/docs/source/debugging.rst +++ b/docs/source/debugging.rst @@ -13,88 +13,98 @@ Debugging -========= +======================================================================================================================= Activations Overflow --------------------- +----------------------------------------------------------------------------------------------------------------------- .. note:: This feature is currently available for PyTorch-only. -If you start getting ``loss=NaN`` or the model inhibits some other abnormal behavior due to ``inf``s or ``nan``s one needs to discover where the first overflow happens and what led to it. Luckily you can accomplish that easily by activating a special module that will do the detection automatically. +If you start getting ``loss=NaN`` or the model inhibits some other abnormal behavior due to ``inf`` or ``nan`` one +needs to discover where the first overflow happens and what led to it. Luckily you can accomplish that easily by +activating a special module that will do the detection automatically. If you're using :class:`~transformers.Trainer`, you just need to add: .. 
code-block:: bash - --debug activation_overflow + --debug activation_overflow -to the normal command line arguments, or pass ``debug="activation_overflow"`` when creating the :class:`~transformers.Trainer` object. +to the normal command line arguments, or pass ``debug="activation_overflow"`` when creating the +:class:`~transformers.Trainer` object. If you're using your own trainer you can just do: .. code-block:: python - from .debug_utils import DebugActivationOverflow - debug_overflow = DebugActivationOverflow(model) + from .debug_utils import DebugActivationOverflow + debug_overflow = DebugActivationOverflow(model) -``DebugActivationOverflow`` inserts hooks into the model that will test each input and output and as soon as ``inf`` or ``nan`` is detected in at least one element, the program will assert and print a report like this: +``DebugActivationOverflow`` inserts hooks into the model that will test each input and output and as soon as ``inf`` or +``nan`` is detected in at least one element, the program will assert and print a report like this: .. code-block:: - < [0] encoder.block.2.layer.1.DenseReluDense.wo: Linear: output has infs - - - last 40 frames: - abs_max= 5.96e+02 < [0] encoder.block.1.layer.1.DenseReluDense.dropout: Dropout: output - abs_max= 5.96e+02 > [0] encoder.block.1.layer.1.DenseReluDense.wo: Linear: input[0] - abs_max= 3.17e+03 < [0] encoder.block.1.layer.1.DenseReluDense.wo: Linear: output - abs_max= 2.57e+00 > [0] encoder.block.1.layer.1.DenseReluDense: T5DenseGatedGeluDense: input[0] - abs_max= 3.17e+03 < [0] encoder.block.1.layer.1.DenseReluDense: T5DenseGatedGeluDense: output - abs_max= 3.17e+03 > [0] encoder.block.1.layer.1.dropout: Dropout: input[0] - abs_max= 3.52e+03 < [0] encoder.block.1.layer.1.dropout: Dropout: output - abs_max= 1.58e+03 > [0] encoder.block.1.layer.1: T5LayerFF: input[0] - abs_max= 4.04e+03 < [0] encoder.block.1.layer.1: T5LayerFF: output - abs_max= 1.51e+03 > [0] encoder.block.1: T5Block: input[0] - abs_max= 4.04e+03 < [0] encoder.block.1: T5Block: output[0] - abs_max= 1.00e+04 < [0] encoder.block.1: T5Block: output[2] - abs_max= 4.04e+03 > [0] encoder.block.2.layer.0.layer_norm: T5LayerNorm: input[0] - abs_max= 2.69e+00 < [0] encoder.block.2.layer.0.layer_norm: T5LayerNorm: output - abs_max= 2.69e+00 > [0] encoder.block.2.layer.0.SelfAttention.q: Linear: input[0] - abs_max= 1.13e+00 < [0] encoder.block.2.layer.0.SelfAttention.q: Linear: output - abs_max= 2.69e+00 > [0] encoder.block.2.layer.0.SelfAttention.k: Linear: input[0] - abs_max= 1.69e+01 < [0] encoder.block.2.layer.0.SelfAttention.k: Linear: output - abs_max= 2.69e+00 > [0] encoder.block.2.layer.0.SelfAttention.v: Linear: input[0] - abs_max= 8.92e+00 < [0] encoder.block.2.layer.0.SelfAttention.v: Linear: output - abs_max= 7.59e+00 > [0] encoder.block.2.layer.0.SelfAttention.o: Linear: input[0] - abs_max= 2.83e+02 < [0] encoder.block.2.layer.0.SelfAttention.o: Linear: output - abs_max= 2.69e+00 > [0] encoder.block.2.layer.0.SelfAttention: T5Attention: input[0] - abs_max= 2.83e+02 < [0] encoder.block.2.layer.0.SelfAttention: T5Attention: output[0] - abs_max= 1.00e+04 < [0] encoder.block.2.layer.0.SelfAttention: T5Attention: output[2] - abs_max= 2.83e+02 > [0] encoder.block.2.layer.0.dropout: Dropout: input[0] - abs_max= 3.14e+02 < [0] encoder.block.2.layer.0.dropout: Dropout: output - abs_max= 4.04e+03 > [0] encoder.block.2.layer.0: T5LayerSelfAttention: input[0] - abs_max= 4.06e+03 < [0] encoder.block.2.layer.0: T5LayerSelfAttention: output[0] - abs_max= 1.00e+04 < [0] 
encoder.block.2.layer.0: T5LayerSelfAttention: output[2] - abs_max= 4.06e+03 > [0] encoder.block.2.layer.1.layer_norm: T5LayerNorm: input[0] - abs_max= 6.00e+00 < [0] encoder.block.2.layer.1.layer_norm: T5LayerNorm: output - abs_max= 6.00e+00 > [0] encoder.block.2.layer.1.DenseReluDense.wi_0: Linear: input[0] - abs_max= 5.18e+01 < [0] encoder.block.2.layer.1.DenseReluDense.wi_0: Linear: output - abs_max= 6.00e+00 > [0] encoder.block.2.layer.1.DenseReluDense.wi_1: Linear: input[0] - abs_max= 3.14e+02 < [0] encoder.block.2.layer.1.DenseReluDense.wi_1: Linear: output - abs_max= 1.62e+04 > [0] encoder.block.2.layer.1.DenseReluDense.dropout: Dropout: input[0] - abs_max= 1.80e+04 < [0] encoder.block.2.layer.1.DenseReluDense.dropout: Dropout: output - abs_max= 1.80e+04 > [0] encoder.block.2.layer.1.DenseReluDense.wo: Linear: input[0] - abs_max= inf < [0] encoder.block.2.layer.1.DenseReluDense.wo: Linear: output - -The left column shows the value of the absolute largest element, so if you have a closer look the last few frames, the inputs and outputs were in the range of 10000. So when this training was done under mixed precision the very last step overflowed (since under ``fp16`` the largest number before ``inf`` is ``64e3``). To avoid overflows under ``fp16`` the activations must remain way below ``1e4``, because ``1e4*1e4 = 1e8`` so any matrix multiply with large activations is going to lead to overflow. + < [0] encoder.block.2.layer.1.DenseReluDense.wo: Linear: output has infs + + + last 40 frames: + abs_max= 5.96e+02 < [0] encoder.block.1.layer.1.DenseReluDense.dropout: Dropout: output + abs_max= 5.96e+02 > [0] encoder.block.1.layer.1.DenseReluDense.wo: Linear: input[0] + abs_max= 3.17e+03 < [0] encoder.block.1.layer.1.DenseReluDense.wo: Linear: output + abs_max= 2.57e+00 > [0] encoder.block.1.layer.1.DenseReluDense: T5DenseGatedGeluDense: input[0] + abs_max= 3.17e+03 < [0] encoder.block.1.layer.1.DenseReluDense: T5DenseGatedGeluDense: output + abs_max= 3.17e+03 > [0] encoder.block.1.layer.1.dropout: Dropout: input[0] + abs_max= 3.52e+03 < [0] encoder.block.1.layer.1.dropout: Dropout: output + abs_max= 1.58e+03 > [0] encoder.block.1.layer.1: T5LayerFF: input[0] + abs_max= 4.04e+03 < [0] encoder.block.1.layer.1: T5LayerFF: output + abs_max= 1.51e+03 > [0] encoder.block.1: T5Block: input[0] + abs_max= 4.04e+03 < [0] encoder.block.1: T5Block: output[0] + abs_max= 1.00e+04 < [0] encoder.block.1: T5Block: output[2] + abs_max= 4.04e+03 > [0] encoder.block.2.layer.0.layer_norm: T5LayerNorm: input[0] + abs_max= 2.69e+00 < [0] encoder.block.2.layer.0.layer_norm: T5LayerNorm: output + abs_max= 2.69e+00 > [0] encoder.block.2.layer.0.SelfAttention.q: Linear: input[0] + abs_max= 1.13e+00 < [0] encoder.block.2.layer.0.SelfAttention.q: Linear: output + abs_max= 2.69e+00 > [0] encoder.block.2.layer.0.SelfAttention.k: Linear: input[0] + abs_max= 1.69e+01 < [0] encoder.block.2.layer.0.SelfAttention.k: Linear: output + abs_max= 2.69e+00 > [0] encoder.block.2.layer.0.SelfAttention.v: Linear: input[0] + abs_max= 8.92e+00 < [0] encoder.block.2.layer.0.SelfAttention.v: Linear: output + abs_max= 7.59e+00 > [0] encoder.block.2.layer.0.SelfAttention.o: Linear: input[0] + abs_max= 2.83e+02 < [0] encoder.block.2.layer.0.SelfAttention.o: Linear: output + abs_max= 2.69e+00 > [0] encoder.block.2.layer.0.SelfAttention: T5Attention: input[0] + abs_max= 2.83e+02 < [0] encoder.block.2.layer.0.SelfAttention: T5Attention: output[0] + abs_max= 1.00e+04 < [0] encoder.block.2.layer.0.SelfAttention: T5Attention: output[2] + abs_max= 
2.83e+02 > [0] encoder.block.2.layer.0.dropout: Dropout: input[0] + abs_max= 3.14e+02 < [0] encoder.block.2.layer.0.dropout: Dropout: output + abs_max= 4.04e+03 > [0] encoder.block.2.layer.0: T5LayerSelfAttention: input[0] + abs_max= 4.06e+03 < [0] encoder.block.2.layer.0: T5LayerSelfAttention: output[0] + abs_max= 1.00e+04 < [0] encoder.block.2.layer.0: T5LayerSelfAttention: output[2] + abs_max= 4.06e+03 > [0] encoder.block.2.layer.1.layer_norm: T5LayerNorm: input[0] + abs_max= 6.00e+00 < [0] encoder.block.2.layer.1.layer_norm: T5LayerNorm: output + abs_max= 6.00e+00 > [0] encoder.block.2.layer.1.DenseReluDense.wi_0: Linear: input[0] + abs_max= 5.18e+01 < [0] encoder.block.2.layer.1.DenseReluDense.wi_0: Linear: output + abs_max= 6.00e+00 > [0] encoder.block.2.layer.1.DenseReluDense.wi_1: Linear: input[0] + abs_max= 3.14e+02 < [0] encoder.block.2.layer.1.DenseReluDense.wi_1: Linear: output + abs_max= 1.62e+04 > [0] encoder.block.2.layer.1.DenseReluDense.dropout: Dropout: input[0] + abs_max= 1.80e+04 < [0] encoder.block.2.layer.1.DenseReluDense.dropout: Dropout: output + abs_max= 1.80e+04 > [0] encoder.block.2.layer.1.DenseReluDense.wo: Linear: input[0] + abs_max= inf < [0] encoder.block.2.layer.1.DenseReluDense.wo: Linear: output + +The left column shows the value of the absolute largest element, so if you have a closer look the last few frames, the +inputs and outputs were in the range of 10000. So when this training was done under mixed precision the very last step +overflowed (since under ``fp16`` the largest number before ``inf`` is ``64e3``). To avoid overflows under ``fp16`` the +activations must remain way below ``1e4``, because ``1e4*1e4 = 1e8`` so any matrix multiply with large activations is +going to lead to overflow. The trace then prints the batch number (here ``[0]`` means the problem occurred on the first batch). -Then comes the fully qualified entry from the ``state_dict``, e.g.: ``encoder.block.2.layer.0.layer_norm``, so you can easily see where the problem happens and what was happening just before it. +Then comes the fully qualified entry from the ``state_dict``, e.g.: ``encoder.block.2.layer.0.layer_norm``, so you can +easily see where the problem happens and what was happening just before it. -The second to last entry show the name of the class the ``forward`` belongs to, and whether the report is for an input or an output and its index if either is a tuple. Only tensor variables are reported. +The second to last entry show the name of the class the ``forward`` belongs to, and whether the report is for an input +or an output and its index if either is a tuple. Only tensor variables are reported. Another shortcut in the first columns ``>`` is for input variable, ``<`` is for output. @@ -102,26 +112,34 @@ Let's look at: .. code-block:: - abs_max= 1.62e+04 > [0] encoder.block.2.layer.1.DenseReluDense.dropout: Dropout: input[0] - abs_max= 1.80e+04 < [0] encoder.block.2.layer.1.DenseReluDense.dropout: Dropout: output + abs_max= 1.62e+04 > [0] encoder.block.2.layer.1.DenseReluDense.dropout: Dropout: input[0] + abs_max= 1.80e+04 < [0] encoder.block.2.layer.1.DenseReluDense.dropout: Dropout: output -This is a report for ``Dropout.forward`` function with the first entry for the only input and the second for the only output. You can see that it was called from an attribute ``dropout`` inside ``DenseReluDense`` class. We can see that it happened during the first layer, of the 2nd block, during the very first batch. 
Finally the absolute largest input elements was ``1.62e+04`` and same for the output was ``1.80e+04``. +This is a report for ``Dropout.forward`` function with the first entry for the only input and the second for the only +output. You can see that it was called from an attribute ``dropout`` inside ``DenseReluDense`` class. We can see that +it happened during the first layer, of the 2nd block, during the very first batch. Finally the absolute largest input +elements was ``1.62e+04`` and same for the output was ``1.80e+04``. -Going back to the full report, to act on it and to fix the problem, we need to go a few frames up where the numbers started to go up and most likely switch to the ``fp32`` mode here, so that the numbers don't overflow when multiplied or summed up. Of course, there might be other solutions. +Going back to the full report, to act on it and to fix the problem, we need to go a few frames up where the numbers +started to go up and most likely switch to the ``fp32`` mode here, so that the numbers don't overflow when multiplied +or summed up. Of course, there might be other solutions. -Since the automatic detector only reports inputs and outputs, once you know where to look, you may want to analyse the intermediary stages of ``forward`` as well. In such a case you can use the helper function to inject the detector where you want it, for example: +Since the automatic detector only reports inputs and outputs, once you know where to look, you may want to analyse the +intermediary stages of ``forward`` as well. In such a case you can use the helper function to inject the detector where +you want it, for example: .. code-block:: - from debug_utils import detect_overflow + from debug_utils import detect_overflow - class T5LayerFF(nn.Module): - [...] - def forward(self, hidden_states): - forwarded_states = self.layer_norm(hidden_states) - detect_overflow(forwarded_states, "after layer_norm") - forwarded_states = self.DenseReluDense(forwarded_states) - detect_overflow(forwarded_states, "after DenseReluDense") - return hidden_states + self.dropout(forwarded_states) + class T5LayerFF(nn.Module): + [...] + def forward(self, hidden_states): + forwarded_states = self.layer_norm(hidden_states) + detect_overflow(forwarded_states, "after layer_norm") + forwarded_states = self.DenseReluDense(forwarded_states) + detect_overflow(forwarded_states, "after DenseReluDense") + return hidden_states + self.dropout(forwarded_states) -You can see that we added 2 of these and now we can know the absolute largest numbers for ``forwarded_states`` at 2 different stages. +You can see that we added 2 of these and now we can know the absolute largest numbers for ``forwarded_states`` at 2 +different stages. diff --git a/src/transformers/debug_utils.py b/src/transformers/debug_utils.py index c247107544c0e0..9aebeca4c863c2 100644 --- a/src/transformers/debug_utils.py +++ b/src/transformers/debug_utils.py @@ -27,7 +27,7 @@ class DebugActivationOverflow: """ - This debug class helps detect and understand where the model starts getting ``nan``s or ``inf``s in activation + This debug class helps detect and understand where the model starts getting ``nan`` or ``inf`` in activation elements. 
To activate, initialize the object with the model :: From 832b49662bc8ec4943bf91d77b4ccd26f9e6b5da Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 16 Apr 2021 10:42:42 -0700 Subject: [PATCH 11/20] improve the doc --- docs/source/debugging.rst | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/docs/source/debugging.rst b/docs/source/debugging.rst index 3830ebc1a20655..b88c898fae8ab8 100644 --- a/docs/source/debugging.rst +++ b/docs/source/debugging.rst @@ -22,9 +22,9 @@ Activations Overflow This feature is currently available for PyTorch-only. -If you start getting ``loss=NaN`` or the model inhibits some other abnormal behavior due to ``inf`` or ``nan`` one -needs to discover where the first overflow happens and what led to it. Luckily you can accomplish that easily by -activating a special module that will do the detection automatically. +If you start getting ``loss=NaN`` or the model inhibits some other abnormal behavior due to ``inf`` or ``nan`` in +activations one needs to discover where the first overflow happens and what led to it. Luckily you can accomplish that +easily by activating a special module that will do the detection automatically. If you're using :class:`~transformers.Trainer`, you just need to add: @@ -35,15 +35,16 @@ If you're using :class:`~transformers.Trainer`, you just need to add: to the normal command line arguments, or pass ``debug="activation_overflow"`` when creating the :class:`~transformers.Trainer` object. -If you're using your own trainer you can just do: +If you're using your own trainer you can accomplish the same with: .. code-block:: python from .debug_utils import DebugActivationOverflow debug_overflow = DebugActivationOverflow(model) -``DebugActivationOverflow`` inserts hooks into the model that will test each input and output and as soon as ``inf`` or -``nan`` is detected in at least one element, the program will assert and print a report like this: +:class:`~transformers.debug_utils.DebugActivationOverflow` inserts hooks into the model that will test each input and +output and as soon as ``inf`` or ``nan`` is detected in at least one element of the activations, the program will +assert and print a report like this: .. code-block:: @@ -92,21 +93,21 @@ If you're using your own trainer you can just do: abs_max= 1.80e+04 > [0] encoder.block.2.layer.1.DenseReluDense.wo: Linear: input[0] abs_max= inf < [0] encoder.block.2.layer.1.DenseReluDense.wo: Linear: output -The left column shows the value of the absolute largest element, so if you have a closer look the last few frames, the -inputs and outputs were in the range of 10000. So when this training was done under mixed precision the very last step -overflowed (since under ``fp16`` the largest number before ``inf`` is ``64e3``). To avoid overflows under ``fp16`` the -activations must remain way below ``1e4``, because ``1e4*1e4 = 1e8`` so any matrix multiply with large activations is -going to lead to overflow. +The left column shows the value of the absolute largest element, so if you have a closer look at the last few frames, +the inputs and outputs were in the range of ``1e4``. So when this training was done under mixed precision the very last +step overflowed (since under ``fp16`` the largest number before ``inf`` is ``64e3``). To avoid overflows under ``fp16`` +the activations must remain way below ``1e4``, because ``1e4 * 1e4 = 1e8`` so any matrix multiplication with large +activations is going to lead to a numerical overflow condition. 
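If you want to convince yourself of that limit, here is a standalone sanity check (an illustration only, not part of
the debug module):

.. code-block:: python

    import torch

    # the largest finite fp16 value is 65504, i.e. roughly 64e3
    print(torch.finfo(torch.float16).max)

    # 1e4 * 1e4 = 1e8 cannot be represented in fp16, so the product overflows to inf
    x = torch.tensor([1e4], dtype=torch.float16)
    print(x * x)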
The trace then prints the batch number (here ``[0]`` means the problem occurred on the first batch). Then comes the fully qualified entry from the ``state_dict``, e.g.: ``encoder.block.2.layer.0.layer_norm``, so you can easily see where the problem happens and what was happening just before it. -The second to last entry show the name of the class the ``forward`` belongs to, and whether the report is for an input -or an output and its index if either is a tuple. Only tensor variables are reported. +The second to last entry shows the name of the class the ``forward`` belongs to, and also whether the report is for an +input or an output and its index if either is a tuple. Only tensor variables are reported. -Another shortcut in the first columns ``>`` is for input variable, ``<`` is for output. +Another shortcut in the first columns: ``>`` designates an input variable, and ``<`` for output. Let's look at: @@ -125,8 +126,8 @@ started to go up and most likely switch to the ``fp32`` mode here, so that the n or summed up. Of course, there might be other solutions. Since the automatic detector only reports inputs and outputs, once you know where to look, you may want to analyse the -intermediary stages of ``forward`` as well. In such a case you can use the helper function to inject the detector where -you want it, for example: +intermediary stages of ``forward`` as well. In such a case you can use the ``detect_overflow`` helper function to +inject the detector where you want it, for example: .. code-block:: From 8562c46a499d13eae1fd96e4a254c707cce2a9c9 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Mon, 19 Apr 2021 12:57:54 -0700 Subject: [PATCH 12/20] rework to dump a lot more info about each frame --- src/transformers/debug_utils.py | 92 ++++++++++++++++++++------------- 1 file changed, 55 insertions(+), 37 deletions(-) diff --git a/src/transformers/debug_utils.py b/src/transformers/debug_utils.py index 9aebeca4c863c2..91ba37dcf0c78a 100644 --- a/src/transformers/debug_utils.py +++ b/src/transformers/debug_utils.py @@ -46,32 +46,38 @@ class DebugActivationOverflow: Args: model (:obj:`nn.Module`): The model to debug. - max_frames_to_save (:obj:`int`, `optional`, defaults to 40): - How many variables and their frames to record back - a few dozens is a good number. + max_frames_to_save (:obj:`int`, `optional`, defaults to 21): + How many frames back to record - a few dozens is a good number. 
""" - def __init__(self, model, max_frames_to_save=40): + def __init__(self, model, max_frames_to_save=21): self.model = model # keep a LIFO buffer of frames to dump as soon as inf/nan is encountered to give context to the problem emergence self.frames = collections.deque([], max_frames_to_save) - self.save_frames = True - self.step = 0 + self.frame = [] + self.batch_number = 0 + self.detected_overflow = False self.analyse_model() self.register_forward_hook() - def save_frame(self, frame): - self.frames.append(frame) + def save_frame(self, frame=None): + if frame is not None: + self.expand_frame(frame) + self.frames.append("\n".join(self.frame)) + self.frame = [] # start a new frame - def dump_saved_frames_once(self): - # dump the previous frames only once (to help debug) - if self.save_frames: - print(f"\n\nlast {len(self.frames)} frames:") - print("\n".join(self.frames)) - print("\n\n") - self.save_frames = False + def expand_frame(self, line): + self.frame.append(line) + + def dump_saved_frames(self): + print(f"\n\nDetected inf/nan during batch_number={self.batch_number}") + print(f"last {len(self.frames)} frames:") + print(f"{'abs min':8} {'abs max':8} metadata") + print("\n".join(self.frames)) + print("\n\n") def analyse_model(self): # extract the fully qualified module names, to be able to report at run time. e.g.: @@ -79,20 +85,13 @@ def analyse_model(self): # # for shared weights only the first shared module name will be registered self.module_names = {m: name for name, m in self.model.named_modules()} + self.longest_module_name = max(len(v) for v in self.module_names.values()) def analyse_variable(self, var, ctx): if torch.is_tensor(var): - if self.save_frames: - self.save_frame(get_abs_max(var, ctx)) - + self.expand_frame(get_abs_min_max(var, ctx)) if detect_overflow(var, ctx): - self.dump_saved_frames_once() - - # now we can die, as it's pointless to continue running - raise ValueError( - "DebugActivationOverflow: inf/nan detected, aborting as there is no point running further. " - "Please scroll up above this traceback to see the activation values prior to this event." 
- ) + self.detected_overflow = True def register_forward_hook(self): self.model.apply(self._register_forward_hook) @@ -104,34 +103,53 @@ def forward_hook(self, module, input, output): # - input is a tuple of packed inputs (could be non-Tensors) # - output could be a Tensor or a tuple of Tensors and non-Tensors - # count at which step we are (batch number) + prefix = " " + + # count batch numbers if module == self.model: - self.step += 1 + self.batch_number += 1 + self.expand_frame(f"{prefix} Start batch_number={self.batch_number}") - ctx = f"[{self.step}] {self.module_names[module]}: {module.__class__.__name__}" + self.expand_frame(f"{prefix} {self.module_names[module]} {module.__class__.__name__}") - for i, x in enumerate(input): - self.analyse_variable(x, f"> {ctx}: input[{i}]") + # params + for name, p in module.named_parameters(recurse=False): + self.analyse_variable(p, name) + # inputs + if len(input) > 1: + for i, x in enumerate(input): + self.analyse_variable(x, f"input[{i}]") + else: + self.analyse_variable(input[0], "input") + + # outputs if isinstance(output, tuple): for i, x in enumerate(output): # possibly a tuple of tuples if isinstance(x, tuple): for j, y in enumerate(x): - self.analyse_variable(y, f"< {ctx}: output[{i}][{j}]") + self.analyse_variable(y, f"output[{i}][{j}]") else: - self.analyse_variable(x, f"< {ctx}: output[{i}]") + self.analyse_variable(x, f"output[{i}]") else: - self.analyse_variable(output, f"< {ctx}: output") + self.analyse_variable(output, "output") + + self.save_frame() + if self.detected_overflow: + self.dump_saved_frames() -def get_abs_max(var, ctx): - abs_max = max(abs(var.min()), abs(var.max())) - return f"abs_max={abs_max:9.2e} {ctx}" + # now we can die, as it's pointless to continue running + raise ValueError( + "DebugActivationOverflow: inf/nan detected, aborting as there is no point running further. " + "Please scroll up above this traceback to see the activation values prior to this event." + ) -def get_min_max(var, ctx): - return f"min={var.min():9.2e} max={var.max():9.2e} {ctx}" +def get_abs_min_max(var, ctx): + abs_var = var.abs() + return f"{abs_var.min():8.2e} {abs_var.max():8.2e} {ctx}" def detect_overflow(var, ctx): From ef1cd0193f61f5417f86580c75a8c1e228ec4abe Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Thu, 29 Apr 2021 15:51:03 -0700 Subject: [PATCH 13/20] complete expansion --- docs/source/debugging.rst | 315 +++++++++++++----- docs/source/internal/trainer_utils.rst | 2 +- .../legacy/seq2seq/run_distributed_eval.py | 3 + src/transformers/debug_utils.py | 205 +++++++++--- src/transformers/trainer.py | 6 +- src/transformers/training_args.py | 5 +- 6 files changed, 404 insertions(+), 132 deletions(-) diff --git a/docs/source/debugging.rst b/docs/source/debugging.rst index b88c898fae8ab8..3f9d15d6f03cec 100644 --- a/docs/source/debugging.rst +++ b/docs/source/debugging.rst @@ -15,7 +15,7 @@ Debugging ======================================================================================================================= -Activations Overflow +Underflow and Overflow Detection ----------------------------------------------------------------------------------------------------------------------- .. note:: @@ -23,113 +23,188 @@ Activations Overflow This feature is currently available for PyTorch-only. If you start getting ``loss=NaN`` or the model inhibits some other abnormal behavior due to ``inf`` or ``nan`` in -activations one needs to discover where the first overflow happens and what led to it. 
Luckily you can accomplish that -easily by activating a special module that will do the detection automatically. +activations or weights one needs to discover where the first underflow or overflow happens and what led to it. Luckily +you can accomplish that easily by activating a special module that will do the detection automatically. If you're using :class:`~transformers.Trainer`, you just need to add: .. code-block:: bash - --debug activation_overflow + --debug underflow_overflow -to the normal command line arguments, or pass ``debug="activation_overflow"`` when creating the +to the normal command line arguments, or pass ``debug="underflow_overflow"`` when creating the :class:`~transformers.Trainer` object. If you're using your own trainer you can accomplish the same with: .. code-block:: python - from .debug_utils import DebugActivationOverflow - debug_overflow = DebugActivationOverflow(model) + from .debug_utils import DebugUnderflowOverflow + debug_overflow = DebugUnderflowOverflow(model) -:class:`~transformers.debug_utils.DebugActivationOverflow` inserts hooks into the model that will test each input and -output and as soon as ``inf`` or ``nan`` is detected in at least one element of the activations, the program will -assert and print a report like this: +:class:`~transformers.debug_utils.DebugUnderflowOverflow` inserts hooks into the model that immediately after each +forward call will test input and output variables and also the corresponding module's weights. As soon as ``inf`` or +``nan`` is detected in at least one element of the activations or weights, the program will assert and print a report +like this (this was caught with ``google/mt5-small`` under fp16 mixed precision): .. code-block:: - < [0] encoder.block.2.layer.1.DenseReluDense.wo: Linear: output has infs - - - last 40 frames: - abs_max= 5.96e+02 < [0] encoder.block.1.layer.1.DenseReluDense.dropout: Dropout: output - abs_max= 5.96e+02 > [0] encoder.block.1.layer.1.DenseReluDense.wo: Linear: input[0] - abs_max= 3.17e+03 < [0] encoder.block.1.layer.1.DenseReluDense.wo: Linear: output - abs_max= 2.57e+00 > [0] encoder.block.1.layer.1.DenseReluDense: T5DenseGatedGeluDense: input[0] - abs_max= 3.17e+03 < [0] encoder.block.1.layer.1.DenseReluDense: T5DenseGatedGeluDense: output - abs_max= 3.17e+03 > [0] encoder.block.1.layer.1.dropout: Dropout: input[0] - abs_max= 3.52e+03 < [0] encoder.block.1.layer.1.dropout: Dropout: output - abs_max= 1.58e+03 > [0] encoder.block.1.layer.1: T5LayerFF: input[0] - abs_max= 4.04e+03 < [0] encoder.block.1.layer.1: T5LayerFF: output - abs_max= 1.51e+03 > [0] encoder.block.1: T5Block: input[0] - abs_max= 4.04e+03 < [0] encoder.block.1: T5Block: output[0] - abs_max= 1.00e+04 < [0] encoder.block.1: T5Block: output[2] - abs_max= 4.04e+03 > [0] encoder.block.2.layer.0.layer_norm: T5LayerNorm: input[0] - abs_max= 2.69e+00 < [0] encoder.block.2.layer.0.layer_norm: T5LayerNorm: output - abs_max= 2.69e+00 > [0] encoder.block.2.layer.0.SelfAttention.q: Linear: input[0] - abs_max= 1.13e+00 < [0] encoder.block.2.layer.0.SelfAttention.q: Linear: output - abs_max= 2.69e+00 > [0] encoder.block.2.layer.0.SelfAttention.k: Linear: input[0] - abs_max= 1.69e+01 < [0] encoder.block.2.layer.0.SelfAttention.k: Linear: output - abs_max= 2.69e+00 > [0] encoder.block.2.layer.0.SelfAttention.v: Linear: input[0] - abs_max= 8.92e+00 < [0] encoder.block.2.layer.0.SelfAttention.v: Linear: output - abs_max= 7.59e+00 > [0] encoder.block.2.layer.0.SelfAttention.o: Linear: input[0] - abs_max= 2.83e+02 < [0] 
encoder.block.2.layer.0.SelfAttention.o: Linear: output - abs_max= 2.69e+00 > [0] encoder.block.2.layer.0.SelfAttention: T5Attention: input[0] - abs_max= 2.83e+02 < [0] encoder.block.2.layer.0.SelfAttention: T5Attention: output[0] - abs_max= 1.00e+04 < [0] encoder.block.2.layer.0.SelfAttention: T5Attention: output[2] - abs_max= 2.83e+02 > [0] encoder.block.2.layer.0.dropout: Dropout: input[0] - abs_max= 3.14e+02 < [0] encoder.block.2.layer.0.dropout: Dropout: output - abs_max= 4.04e+03 > [0] encoder.block.2.layer.0: T5LayerSelfAttention: input[0] - abs_max= 4.06e+03 < [0] encoder.block.2.layer.0: T5LayerSelfAttention: output[0] - abs_max= 1.00e+04 < [0] encoder.block.2.layer.0: T5LayerSelfAttention: output[2] - abs_max= 4.06e+03 > [0] encoder.block.2.layer.1.layer_norm: T5LayerNorm: input[0] - abs_max= 6.00e+00 < [0] encoder.block.2.layer.1.layer_norm: T5LayerNorm: output - abs_max= 6.00e+00 > [0] encoder.block.2.layer.1.DenseReluDense.wi_0: Linear: input[0] - abs_max= 5.18e+01 < [0] encoder.block.2.layer.1.DenseReluDense.wi_0: Linear: output - abs_max= 6.00e+00 > [0] encoder.block.2.layer.1.DenseReluDense.wi_1: Linear: input[0] - abs_max= 3.14e+02 < [0] encoder.block.2.layer.1.DenseReluDense.wi_1: Linear: output - abs_max= 1.62e+04 > [0] encoder.block.2.layer.1.DenseReluDense.dropout: Dropout: input[0] - abs_max= 1.80e+04 < [0] encoder.block.2.layer.1.DenseReluDense.dropout: Dropout: output - abs_max= 1.80e+04 > [0] encoder.block.2.layer.1.DenseReluDense.wo: Linear: input[0] - abs_max= inf < [0] encoder.block.2.layer.1.DenseReluDense.wo: Linear: output - -The left column shows the value of the absolute largest element, so if you have a closer look at the last few frames, -the inputs and outputs were in the range of ``1e4``. So when this training was done under mixed precision the very last -step overflowed (since under ``fp16`` the largest number before ``inf`` is ``64e3``). To avoid overflows under ``fp16`` -the activations must remain way below ``1e4``, because ``1e4 * 1e4 = 1e8`` so any matrix multiplication with large -activations is going to lead to a numerical overflow condition. - -The trace then prints the batch number (here ``[0]`` means the problem occurred on the first batch). - -Then comes the fully qualified entry from the ``state_dict``, e.g.: ``encoder.block.2.layer.0.layer_norm``, so you can -easily see where the problem happens and what was happening just before it. - -The second to last entry shows the name of the class the ``forward`` belongs to, and also whether the report is for an -input or an output and its index if either is a tuple. Only tensor variables are reported. - -Another shortcut in the first columns: ``>`` designates an input variable, and ``<`` for output. - -Let's look at: + Detected inf/nan during batch_number=0 + Last 21 forward frames: + abs min abs max metadata + encoder.block.1.layer.1.DenseReluDense.dropout Dropout + 0.00e+00 2.57e+02 input[0] + 0.00e+00 2.85e+02 output + [...] 
+ encoder.block.2.layer.0 T5LayerSelfAttention + 6.78e-04 3.15e+03 input[0] + 2.65e-04 3.42e+03 output[0] + None output[1] + 2.25e-01 1.00e+04 output[2] + encoder.block.2.layer.1.layer_norm T5LayerNorm + 8.69e-02 4.18e-01 weight + 2.65e-04 3.42e+03 input[0] + 1.79e-06 4.65e+00 output + encoder.block.2.layer.1.DenseReluDense.wi_0 Linear + 2.17e-07 4.50e+00 weight + 1.79e-06 4.65e+00 input[0] + 2.68e-06 3.70e+01 output + encoder.block.2.layer.1.DenseReluDense.wi_1 Linear + 8.08e-07 2.66e+01 weight + 1.79e-06 4.65e+00 input[0] + 1.27e-04 2.37e+02 output + encoder.block.2.layer.1.DenseReluDense.dropout Dropout + 0.00e+00 8.76e+03 input[0] + 0.00e+00 9.74e+03 output + encoder.block.2.layer.1.DenseReluDense.wo Linear + 1.01e-06 6.44e+00 weight + 0.00e+00 9.74e+03 input[0] + 3.18e-04 6.27e+04 output + encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense + 1.79e-06 4.65e+00 input[0] + 3.18e-04 6.27e+04 output + encoder.block.2.layer.1.dropout Dropout + 3.18e-04 6.27e+04 input[0] + 0.00e+00 inf output + +The example output has been trimmed in the middle for brevity. + +The second column shows the value of the absolute largest element, so if you have a closer look at the last few frames, +the inputs and outputs were in the range of ``1e4``. So when this training was done under fp16 mixed precision the very +last step overflowed (since under ``fp16`` the largest number before ``inf`` is ``64e3``). To avoid overflows under +``fp16`` the activations must remain way below ``1e4``, because ``1e4 * 1e4 = 1e8`` so any matrix multiplication with +large activations is going to lead to a numerical overflow condition. + +At the very start of the trace you can discover at which batch number the problem occurred (here ``Detected inf/nan +during batch_number=0`` means the problem occurred on the first batch). + +Each reported frame starts by declaring the fully qualified entry for the corresponding module this frame is reporting +for. If we look just at this frame: .. code-block:: - abs_max= 1.62e+04 > [0] encoder.block.2.layer.1.DenseReluDense.dropout: Dropout: input[0] - abs_max= 1.80e+04 < [0] encoder.block.2.layer.1.DenseReluDense.dropout: Dropout: output + encoder.block.2.layer.1.layer_norm T5LayerNorm + 8.69e-02 4.18e-01 weight + 2.65e-04 3.42e+03 input[0] + 1.79e-06 4.65e+00 output -This is a report for ``Dropout.forward`` function with the first entry for the only input and the second for the only -output. You can see that it was called from an attribute ``dropout`` inside ``DenseReluDense`` class. We can see that -it happened during the first layer, of the 2nd block, during the very first batch. Finally the absolute largest input -elements was ``1.62e+04`` and same for the output was ``1.80e+04``. +Here, ``encoder.block.2.layer.1.layer_norm`` indicates that it was a layer norm for the first layer, of the second +block of the encoder. And the specific calls of the ``forward`` is ``T5LayerNorm``. + +Let's look at the last few frames of that report: + +.. code-block:: + + Detected inf/nan during batch_number=0 + Last 21 forward frames: + abs min abs max metadata + [...] 
+    encoder.block.2.layer.1.DenseReluDense.wi_0 Linear
+    2.17e-07 4.50e+00 weight
+    1.79e-06 4.65e+00 input[0]
+    2.68e-06 3.70e+01 output
+    encoder.block.2.layer.1.DenseReluDense.wi_1 Linear
+    8.08e-07 2.66e+01 weight
+    1.79e-06 4.65e+00 input[0]
+    1.27e-04 2.37e+02 output
+    encoder.block.2.layer.1.DenseReluDense.wo Linear
+    1.01e-06 6.44e+00 weight
+    0.00e+00 9.74e+03 input[0]
+    3.18e-04 6.27e+04 output
+    encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense
+    1.79e-06 4.65e+00 input[0]
+    3.18e-04 6.27e+04 output
+    encoder.block.2.layer.1.dropout Dropout
+    3.18e-04 6.27e+04 input[0]
+    0.00e+00 inf output
+
+The last frame reports on the ``Dropout.forward`` function, with the first entry for the only input and the second for
+the only output. You can see that it was called from an attribute ``dropout`` inside the ``DenseReluDense`` class. We
+can see that it happened during the first layer of the 2nd block, during the very first batch. Finally, the absolute
+largest input element was ``6.27e+04`` and the same for the output was ``inf``.
+
+You can see here that ``T5DenseGatedGeluDense.forward`` resulted in output activations, whose absolute max value was
+around 62.7K, which is very close to fp16's top limit of 64K. In the next frame we have ``Dropout`` which renormalizes
+the weights, after it zeroed some of the elements, which pushes the absolute max value to more than 64K, and we get an
+overflow (``inf``).
+
+As you can see it's the previous frames that we need to look into when the numbers start getting too large for fp16.
+
+Let's match the report to the code from ``models/t5/modeling_t5.py``:
+
+.. code-block:: python
+
+    class T5DenseGatedGeluDense(nn.Module):
+        def __init__(self, config):
+            super().__init__()
+            self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False)
+            self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False)
+            self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
+            self.dropout = nn.Dropout(config.dropout_rate)
+            self.gelu_act = ACT2FN["gelu_new"]
+
+        def forward(self, hidden_states):
+            hidden_gelu = self.gelu_act(self.wi_0(hidden_states))
+            hidden_linear = self.wi_1(hidden_states)
+            hidden_states = hidden_gelu * hidden_linear
+            hidden_states = self.dropout(hidden_states)
+            hidden_states = self.wo(hidden_states)
+            return hidden_states
+
+Now it's easy to see the ``dropout`` call, and all the previous calls as well.
+
+Since the detection is happening in a forward hook, these reports are printed immediately after each ``forward``
+returns.
 Going back to the full report, to act on it and to fix the problem, we need to go a few frames up where the numbers
 started to go up and most likely switch to the ``fp32`` mode here, so that the numbers don't overflow when multiplied
-or summed up. Of course, there might be other solutions.
+or summed up. Of course, there might be other solutions. For example, we could turn off ``amp`` temporarily if it's
+enabled, after moving the original ``forward`` into a helper wrapper, like so:
-Since the automatic detector only reports inputs and outputs, once you know where to look, you may want to analyse the
-intermediary stages of ``forward`` as well. In such a case you can use the ``detect_overflow`` helper function to
-inject the detector where you want it, for example:
+.. code-block:: python
-.. code-block::
+    def _forward(self, hidden_states):
+        hidden_gelu = self.gelu_act(self.wi_0(hidden_states))
+        hidden_linear = self.wi_1(hidden_states)
+        hidden_states = hidden_gelu * hidden_linear
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.wo(hidden_states)
+        return hidden_states
+
+    import torch
+    def forward(self, hidden_states):
+        if torch.is_autocast_enabled():
+            with torch.cuda.amp.autocast(enabled=False):
+                return self._forward(hidden_states)
+        else:
+            return self._forward(hidden_states)
+
+Since the automatic detector only reports on inputs and outputs of full frames, once you know where to look, you may
+want to analyse the intermediary stages of any specific ``forward`` function as well. In such a case you can use the
+``detect_overflow`` helper function to inject the detector where you want it, for example:
+
+.. code-block:: python
     from debug_utils import detect_overflow
@@ -142,5 +217,75 @@ inject the detector where you want it, for example:
         detect_overflow(forwarded_states, "after DenseReluDense")
         return hidden_states + self.dropout(forwarded_states)
-You can see that we added 2 of these and now we can know the absolute largest numbers for ``forwarded_states`` at 2
-different stages.
+You can see that we added 2 of these and now we track if ``inf`` or ``nan`` for ``forwarded_states`` was detected
+somewhere in between.
+
+Actually, the detector already reports these because each of the calls in the example above is an ``nn.Module``, but
+if you had some local direct calculations, this is how you'd do that.
+
+Additionally, if you're instantiating the debugger in your own code, you can adjust the number of frames printed from
+its default, e.g.:
+
+.. code-block:: python
+
+    from .debug_utils import DebugUnderflowOverflow
+    debug_overflow = DebugUnderflowOverflow(model, max_frames_to_save=100)
+
+Specific batch absolute min and max value tracing
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The same debugging class can be used for per-batch tracing with the underflow/overflow detection feature turned off.
+
+Let's say you want to watch the absolute min and max values for all the ingredients of each ``forward`` call of a given
+batch, and only do that for batches 1 and 3. Then you instantiate this class as:
+
+.. code-block:: python
+
+    debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1,3])
+
+And now full batches 1 and 3 will be traced using the same format as the underflow/overflow detector does.
+
+Batches are 0-indexed.
+
+This is helpful if you know that the program starts misbehaving after a certain batch number, so you can fast-forward
+right to that area. Here is a sample truncated output for such configuration:
+
+.. code-block::
+
+    *** Starting batch number=1 ***
+    abs min  abs max  metadata
+    shared Embedding
+    1.01e-06 7.92e+02 weight
+    0.00e+00 2.47e+04 input[0]
+    5.36e-05 7.92e+02 output
+    [...]
+    decoder.dropout Dropout
+    1.60e-07 2.27e+01 input[0]
+    0.00e+00 2.52e+01 output
+    decoder T5Stack
+    not a tensor output
+    lm_head Linear
+    1.01e-06 7.92e+02 weight
+    0.00e+00 1.11e+00 input[0]
+    6.06e-02 8.39e+01 output
+    T5ForConditionalGeneration
+    not a tensor output
+
+    *** Starting batch number=3 ***
+    abs min  abs max  metadata
+    shared Embedding
+    1.01e-06 7.92e+02 weight
+    0.00e+00 2.78e+04 input[0]
+    5.36e-05 7.92e+02 output
+    [...]
+
+Here you will get a huge number of frames dumped - as many as there were forward calls in your model, so it may or may
+not be what you want, but sometimes it can be easier to use for debugging purposes than a normal debugger. For example,
+if a problem starts happening at batch number 150, you can dump traces for batches 149 and 150 and compare where the
+numbers started to diverge.
+
+You can also specify the batch number after which to stop the training, with:
+
+.. code-block:: python
+
+    debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1,3], abort_after_batch_num=3)
diff --git a/docs/source/internal/trainer_utils.rst b/docs/source/internal/trainer_utils.rst
index 9229ba595e1103..65720d15bafcc4 100644
--- a/docs/source/internal/trainer_utils.rst
+++ b/docs/source/internal/trainer_utils.rst
@@ -51,4 +51,4 @@ Distributed Evaluation
 Debug Utilities
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: transformers.debug_utils.DebugActivationOverflow
+.. autoclass:: transformers.debug_utils.DebugUnderflowOverflow
diff --git a/examples/legacy/seq2seq/run_distributed_eval.py b/examples/legacy/seq2seq/run_distributed_eval.py
index 655807ba172ee0..8efaa3a7b76f71 100755
--- a/examples/legacy/seq2seq/run_distributed_eval.py
+++ b/examples/legacy/seq2seq/run_distributed_eval.py
@@ -68,6 +68,9 @@ def eval_data_dir(
     save_path = save_dir.joinpath(f"rank_{local_rank}_output.json")
     torch.cuda.set_device(local_rank)
     model = AutoModelForSeq2SeqLM.from_pretrained(model_name).cuda()
+    from transformers.debug_utils import DebugUnderflowOverflow
+
+    DebugUnderflowOverflow(model)
     if fp16:
         model = model.half()
     # determine if we need to increase num_beams
diff --git a/src/transformers/debug_utils.py b/src/transformers/debug_utils.py
index 91ba37dcf0c78a..159e583192ac07 100644
--- a/src/transformers/debug_utils.py
+++ b/src/transformers/debug_utils.py
@@ -25,39 +25,115 @@
 logger = logging.get_logger(__name__)
 
-class DebugActivationOverflow:
+class DebugUnderflowOverflow:
     """
-    This debug class helps detect and understand where the model starts getting ``nan`` or ``inf`` in activation
-    elements.
+    This debug class helps detect and understand where the model starts getting very large or very small, and more
+    importantly ``nan`` or ``inf`` weight and activation elements.
-    To activate, initialize the object with the model ::
+    There are 2 working modes:
-        debug_overflow = DebugActivationOverflow(model)
+    1. Underflow/overflow detection (default)
+    2. Specific batch absolute min/max tracing without detection
-    then run the training as normal and if any ``nan`` or ``inf`` get detected this module will throw an exception and
-    will print several dozens of frames that lead to this event, each line reporting:
+    Mode 1: Underflow/overflow detection
-    1. the absolute largest element of either input or output variable
-    2. the batch number
-    3. the fully qualified state_dict key of which element it was run for,
-    4. the class name whose ``forward`` was run
-    5. and finally whether it was an input or output and its index if it was a tuple.
+
+    To activate the underflow/overflow detection, initialize the object with the model ::
+
+        debug_overflow = DebugUnderflowOverflow(model)
+
+    then run the training as normal and if ``nan`` or ``inf`` gets detected in at least one of the weight, input or
+    output elements this module will throw an exception and will print ``max_frames_to_save`` frames that lead to this
+    event, each frame reporting
+
+    1. the fully qualified module name plus the class name whose ``forward`` was run
+    2. the absolute min and max value of all elements for each module weights, and the inputs and output
+
+    For example, here is the header and the last few frames in detection report for ``google/mt5-small`` run in fp16 mixed precision ::
+
+        Detected inf/nan during batch_number=0
+        Last 21 forward frames:
+        abs min  abs max  metadata
+        [...]
+        encoder.block.2.layer.1.DenseReluDense.wi_0 Linear
+        2.17e-07 4.50e+00 weight
+        1.79e-06 4.65e+00 input[0]
+        2.68e-06 3.70e+01 output
+        encoder.block.2.layer.1.DenseReluDense.wi_1 Linear
+        8.08e-07 2.66e+01 weight
+        1.79e-06 4.65e+00 input[0]
+        1.27e-04 2.37e+02 output
+        encoder.block.2.layer.1.DenseReluDense.wo Linear
+        1.01e-06 6.44e+00 weight
+        0.00e+00 9.74e+03 input[0]
+        3.18e-04 6.27e+04 output
+        encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense
+        1.79e-06 4.65e+00 input[0]
+        3.18e-04 6.27e+04 output
+        encoder.block.2.layer.1.dropout Dropout
+        3.18e-04 6.27e+04 input[0]
+        0.00e+00 inf output
+
+    You can see here that ``T5DenseGatedGeluDense.forward`` resulted in output activations, whose absolute max value
+    was around 62.7K, which is very close to fp16's top limit of 64K. In the next frame we have ``Dropout`` which
+    renormalizes the weights, after it zeroed some of the elements, which pushes the absolute max value to more than
+    64K, and we get an overflow.
+
+    As you can see it's the previous frames that we need to look into when the numbers start getting too large for
+    fp16.
+
+    The tracking is done in a forward hook, which gets invoked immediately after ``forward`` has completed.
+
+    By default the last 21 frames are printed. You can change the default to adjust for your needs. For example ::
+
+        debug_overflow = DebugUnderflowOverflow(model, max_frames_to_save=100)
+
+
+
+    Mode 2. Specific batch absolute min/max tracing without detection
+
+    The second work mode is per-batch tracing with the underflow/overflow detection feature turned off.
+
+    Let's say you want to watch the absolute min and max values for all the ingredients of each ``forward`` call of a given batch, and only do that for batches 1 and 3. Then you instantiate this class as ::
+
+        debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1,3])
+
+    And now full batches 1 and 3 will be traced using the same format as explained above. Batches are 0-indexed.
+
+    This is helpful if you know that the program starts misbehaving after a certain batch number, so you can
+    fast-forward right to that area.
+
+
+
+    You can also specify the batch number after which to stop the training, with ::
+
+        debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1,3], abort_after_batch_num=3)
+
+    This feature is mainly useful in the tracing mode, but you can use it for any mode.

     Args:
         model (:obj:`nn.Module`):
             The model to debug.
         max_frames_to_save (:obj:`int`, `optional`, defaults to 21):
             How many frames back to record - a few dozens is a good number.
+        trace_batch_nums(:obj:`List(int)`, `optional`, defaults to []):
+            How many frames back to record - a few dozens is a good number.
+        abort_after_batch_num (:obj:`int`, `optional`, defaults to None):
+            Whether to abort after a certain batch number has finished
+
     """

-    def __init__(self, model, max_frames_to_save=21):
+    def __init__(self, model, max_frames_to_save=21, trace_batch_nums=[], abort_after_batch_num=None):
         self.model = model
+        self.trace_batch_nums = trace_batch_nums
+        self.abort_after_batch_num = abort_after_batch_num

         # keep a LIFO buffer of frames to dump as soon as inf/nan is encountered to give context to the problem emergence
         self.frames = collections.deque([], max_frames_to_save)
         self.frame = []
         self.batch_number = 0
+        self.total_calls = 0
         self.detected_overflow = False
+        self.prefix = " "

         self.analyse_model()
@@ -72,12 +148,20 @@ def save_frame(self, frame=None):
     def expand_frame(self, line):
         self.frame.append(line)

+    def trace_frames(self):
+        print("\n".join(self.frames))
+        self.frames = []
+
+    def reset_saved_frames(self):
+        self.frames = []
+
     def dump_saved_frames(self):
-        print(f"\n\nDetected inf/nan during batch_number={self.batch_number}")
-        print(f"last {len(self.frames)} frames:")
+        print(f"\nDetected inf/nan during batch_number={self.batch_number}")
+        print(f"Last {len(self.frames)} forward frames:")
         print(f"{'abs min':8} {'abs max':8} metadata")
         print("\n".join(self.frames))
         print("\n\n")
+        self.frames = []

     def analyse_model(self):
         # extract the fully qualified module names, to be able to report at run time. e.g.:
@@ -85,43 +169,38 @@ def analyse_model(self):
         #
         # for shared weights only the first shared module name will be registered
         self.module_names = {m: name for name, m in self.model.named_modules()}
-        self.longest_module_name = max(len(v) for v in self.module_names.values())
+        # self.longest_module_name = max(len(v) for v in self.module_names.values())

     def analyse_variable(self, var, ctx):
         if torch.is_tensor(var):
             self.expand_frame(get_abs_min_max(var, ctx))
             if detect_overflow(var, ctx):
                 self.detected_overflow = True
+        elif var is None:
+            self.expand_frame(f"{'None':>17} {ctx}")
+        else:
+            self.expand_frame(f"{'not a tensor':>17} {ctx}")

-    def register_forward_hook(self):
-        self.model.apply(self._register_forward_hook)
-
-    def _register_forward_hook(self, module):
-        module.register_forward_hook(self.forward_hook)
-
-    def forward_hook(self, module, input, output):
-        # - input is a tuple of packed inputs (could be non-Tensors)
-        # - output could be a Tensor or a tuple of Tensors and non-Tensors
-
-        prefix = " "
+    def batch_start_frame(self):
+        self.expand_frame(f"\n\n{self.prefix} *** Starting batch number={self.batch_number} ***")
+        self.expand_frame(f"{'abs min':8} {'abs max':8} metadata")

-        # count batch numbers
-        if module == self.model:
-            self.batch_number += 1
-            self.expand_frame(f"{prefix} Start batch_number={self.batch_number}")
+    def batch_end_frame(self):
+        self.expand_frame(f"{self.prefix} *** Finished batch number={self.batch_number-1} ***\n\n")

-        self.expand_frame(f"{prefix} {self.module_names[module]} {module.__class__.__name__}")
+    def create_frame(self, module, input, output):
+        self.expand_frame(f"{self.prefix} {self.module_names[module]} {module.__class__.__name__}")

         # params
         for name, p in module.named_parameters(recurse=False):
             self.analyse_variable(p, name)

         # inputs
-        if len(input) > 1:
+        if isinstance(input, tuple):
             for i, x in enumerate(input):
                 self.analyse_variable(x, f"input[{i}]")
         else:
-            self.analyse_variable(input[0], "input")
+            self.analyse_variable(input, "input")

         # outputs
         if isinstance(output, tuple):
@@ -137,15 +216,58 @@ def forward_hook(self, module, input, output):

         self.save_frame()

-        if self.detected_overflow:
+    def register_forward_hook(self):
+        self.model.apply(self._register_forward_hook)
+
+    def _register_forward_hook(self, module):
+        module.register_forward_hook(self.forward_hook)
+
+    def forward_hook(self, module, input, output):
+        # - input is a tuple of packed inputs (could be non-Tensors)
+        # - output could be a Tensor or a tuple of Tensors and non-Tensors
+
+        last_frame_of_batch = False
+
+        trace_mode = True if self.batch_number in self.trace_batch_nums else False
+        if trace_mode:
+            self.reset_saved_frames()
+
+        if self.total_calls == 0:
+            self.batch_start_frame()
+        self.total_calls += 1
+
+        # count batch numbers - the very first forward hook of the batch will be called when the
+        # batch completes - i.e. it gets called very last - we know this batch has finished
+        if module == self.model:
+            self.batch_number += 1
+            last_frame_of_batch = True
+
+        self.create_frame(module, input, output)
+
+        # if last_frame_of_batch:
+        #     self.batch_end_frame()
+
+        if trace_mode:
+            self.trace_frames()
+
+        if last_frame_of_batch:
+            self.batch_start_frame()
+
+        if self.detected_overflow and not trace_mode:
             self.dump_saved_frames()

-            # now we can die, as it's pointless to continue running
+            # now we can abort, as it's pointless to continue running
             raise ValueError(
-                "DebugActivationOverflow: inf/nan detected, aborting as there is no point running further. "
+                "DebugUnderflowOverflow: inf/nan detected, aborting as there is no point running further. "
                 "Please scroll up above this traceback to see the activation values prior to this event."
             )

+        # abort after certain batch if requested to do so
+        if self.abort_after_batch_num is not None and self.batch_number > self.abort_after_batch_num:
+            raise ValueError(
+                f"DebugUnderflowOverflow: aborting after {self.batch_number} batches due to `abort_after_batch_num={self.abort_after_batch_num}` arg"
+            )
+

 def get_abs_min_max(var, ctx):
     abs_var = var.abs()
@@ -154,17 +276,20 @@ def get_abs_min_max(var, ctx):

 def detect_overflow(var, ctx):
     """
-    Report the count of ``nan`` and ``inf`` entries in the tensor.
+    Report of the tensor contains any ``nan`` and ``inf`` entries.

     This is useful for detecting overflows/underflows and best to call right after the function that did some math that
     modified the variable in question.

+    The function contains a few other helper features that you can enable and tweak directly if you want to track
+    various other things.
+
     Args:
         var: tensor variable to check
         ctx: the message to print as a context

     Return:
-        True if inf or nan was detected, False otherwise
+        True if ``inf`` or ``nan`` was detected, False otherwise
     """
     detected = False
     if torch.isnan(var).any().item():
@@ -196,5 +321,5 @@ def detect_overflow(var, ctx):


 class DebugOption(ExplicitEnum):
-    ACIVATION_OVERFLOW = "activation_overflow"
+    UNDERFLOW_OVERFLOW = "underflow_overflow"
     TPU_METRICS_DEBUG = "tpu_metrics_debug"
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 9a4b148a4c2b0a..eebea8b4a2dd72 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -59,7 +59,7 @@
 from . import __version__
 from .configuration_utils import PretrainedConfig
 from .data.data_collator import DataCollator, DataCollatorWithPadding, default_data_collator
-from .debug_utils import DebugActivationOverflow, DebugOption
+from .debug_utils import DebugOption, DebugUnderflowOverflow
 from .dependency_versions_check import dep_version_check
 from .file_utils import (
     CONFIG_NAME,
@@ -1079,8 +1079,8 @@ def train(
             num_train_epochs = int(args.num_train_epochs)
             num_update_steps_per_epoch = max_steps

-        if DebugOption.ACIVATION_OVERFLOW in self.args.debug:
-            debug_overflow = DebugActivationOverflow(self.model)  # noqa
+        if DebugOption.UNDERFLOW_OVERFLOW in self.args.debug:
+            debug_overflow = DebugUnderflowOverflow(self.model)  # noqa

         delay_optimizer_creation = self.sharded_ddp is not None and self.sharded_ddp != ShardedDDPOption.SIMPLE
         if args.deepspeed:
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index dd05d8515da992..f6d6465ad72a6a 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -278,7 +278,7 @@ class TrainingArguments:

             Possible options are:

-            - :obj:`"activation_overflow"`: detects overflow in model's input/outputs and reports the last frames that
+            - :obj:`"underflow_overflow"`: detects overflow in model's input/outputs and reports the last frames that
              led to the event
            - :obj:`"tpu_metrics_debug"`: print debug metrics on TPU

@@ -449,7 +449,7 @@ class TrainingArguments:
         default="",
         metadata={
             "help": "Whether or not to enable debug mode. Current options: "
-            "`activation_overflow` (Detect overflow in activations), "
+            "`underflow_overflow` (Detect underflow and overflow in activations and weights), "
             "`tpu_metrics_debug` (print debug metrics on TPU)."
         },
     )
@@ -640,7 +640,6 @@ def __post_init__(self):
         elif ShardedDDPOption.ZERO_DP_2 in self.sharded_ddp and ShardedDDPOption.ZERO_DP_3 in self.sharded_ddp:
             raise ValueError("`--sharded_ddp zero_dp_2` is not compatible with `--sharded_ddp zero_dp_3`.")
-
         if self.tpu_metrics_debug:
             warnings.warn(
                 "using `--tpu_metrics_debug` is deprecated and will be removed in version 5 of 🤗 Transformers. Use `--debug tpu_metrics_debug` instead",

From 265a357df3741052002754a83816e4a7bc170cd4 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Thu, 29 Apr 2021 15:56:43 -0700
Subject: [PATCH 14/20] cleanup

---
 examples/legacy/seq2seq/run_distributed_eval.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/examples/legacy/seq2seq/run_distributed_eval.py b/examples/legacy/seq2seq/run_distributed_eval.py
index 8efaa3a7b76f71..655807ba172ee0 100755
--- a/examples/legacy/seq2seq/run_distributed_eval.py
+++ b/examples/legacy/seq2seq/run_distributed_eval.py
@@ -68,9 +68,6 @@ def eval_data_dir(
     save_path = save_dir.joinpath(f"rank_{local_rank}_output.json")
     torch.cuda.set_device(local_rank)
     model = AutoModelForSeq2SeqLM.from_pretrained(model_name).cuda()
-    from transformers.debug_utils import DebugUnderflowOverflow
-
-    DebugUnderflowOverflow(model)
     if fp16:
         model = model.half()
     # determine if we need to increase num_beams

From 931829adabe6039f1346e6c6d56a3ecea67298bd Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Thu, 29 Apr 2021 16:00:30 -0700
Subject: [PATCH 15/20] format

---
 src/transformers/debug_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/debug_utils.py b/src/transformers/debug_utils.py
index 159e583192ac07..d7f85948d14146 100644
--- a/src/transformers/debug_utils.py
+++ b/src/transformers/debug_utils.py
@@ -115,9 +115,9 @@ class DebugUnderflowOverflow:
             The model to debug.
         max_frames_to_save (:obj:`int`, `optional`, defaults to 21):
             How many frames back to record - a few dozens is a good number.
-        trace_batch_nums(:obj:`List(int)`, `optional`, defaults to []):
+        trace_batch_nums(:obj:`List[int]`, `optional`, defaults to ``[]``):
             How many frames back to record - a few dozens is a good number.
-        abort_after_batch_num (:obj:`int`, `optional`, defaults to None):
+        abort_after_batch_num (:obj:`int`, `optional`, defaults to :obj:`None`):
             Whether to abort after a certain batch number has finished

From 97aa9e76a51d6efe81f1971c5fb9f327aec408de Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Thu, 29 Apr 2021 16:01:25 -0700
Subject: [PATCH 16/20] cleanup

---
 src/transformers/debug_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/debug_utils.py b/src/transformers/debug_utils.py
index d7f85948d14146..ba05dfa5d20eb8 100644
--- a/src/transformers/debug_utils.py
+++ b/src/transformers/debug_utils.py
@@ -114,9 +114,9 @@ class DebugUnderflowOverflow:
         model (:obj:`nn.Module`):
             The model to debug.
         max_frames_to_save (:obj:`int`, `optional`, defaults to 21):
-            How many frames back to record - a few dozens is a good number.
+            How many frames back to record
         trace_batch_nums(:obj:`List[int]`, `optional`, defaults to ``[]``):
-            How many frames back to record - a few dozens is a good number.
+            Which batch numbers to trace (turns detection off)
         abort_after_batch_num (:obj:`int`, `optional`, defaults to :obj:`None`):
             Whether to abort after a certain batch number has finished

From 19bb08d941bd3ee8984fb68048570b8ac167c7a3 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Thu, 29 Apr 2021 22:20:34 -0700
Subject: [PATCH 17/20] doesn't have to be transformers

---
 docs/source/debugging.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docs/source/debugging.rst b/docs/source/debugging.rst
index 3f9d15d6f03cec..26a6bf800ed43f 100644
--- a/docs/source/debugging.rst
+++ b/docs/source/debugging.rst
@@ -22,6 +22,10 @@ Underflow and Overflow Detection

 This feature is currently available for PyTorch-only.

+.. note::
+
+    This feature can be used with any ``nn.Module``-based model
+
 If you start getting ``loss=NaN`` or the model inhibits some other abnormal behavior due to ``inf`` or ``nan`` in
 activations or weights one needs to discover where the first underflow or overflow happens and what led to it. Luckily
 you can accomplish that easily by activating a special module that will do the detection automatically.

From ff6ba6238457c67ceca176f42e385d017edad710 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Fri, 30 Apr 2021 10:11:39 -0700
Subject: [PATCH 18/20] Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
---
 docs/source/debugging.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/debugging.rst b/docs/source/debugging.rst
index 26a6bf800ed43f..b13dc1a5e77746 100644
--- a/docs/source/debugging.rst
+++ b/docs/source/debugging.rst
@@ -37,9 +37,9 @@ If you're using :class:`~transformers.Trainer`, you just need to add:

     --debug underflow_overflow

 to the normal command line arguments, or pass ``debug="underflow_overflow"`` when creating the
-:class:`~transformers.Trainer` object.
+:class:`~transformers.TrainingArguments` object.

-If you're using your own trainer you can accomplish the same with:
+If you're using your own training loop or another Trainer you can accomplish the same with:

 .. code-block:: python

From 8a81a557f3d643ab9dec7d7956bae5e2774001bf Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Fri, 30 Apr 2021 10:58:11 -0700
Subject: [PATCH 19/20] wrap long line

---
 src/transformers/debug_utils.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/transformers/debug_utils.py b/src/transformers/debug_utils.py
index ba05dfa5d20eb8..c085a9e46a520e 100644
--- a/src/transformers/debug_utils.py
+++ b/src/transformers/debug_utils.py
@@ -93,7 +93,9 @@ class DebugUnderflowOverflow:

     The second work mode is per-batch tracing with the underflow/overflow detection feature turned off.

-    Let's say you want to watch the absolute min and max values for all the ingredients of each ``forward`` call of a given batch, and only do that for batches 1 and 3. Then you instantiate this class as ::
+    Let's say you want to watch the absolute min and max values for all the ingredients of each
+    ``forward`` call of a given batch, and only do that for batches 1 and 3. Then you instantiate
+    this class as ::

         debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1,3])

From b61c2531ee0c0fe6fc676d3c4d47dca0ab2fdb8c Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Fri, 30 Apr 2021 11:03:21 -0700
Subject: [PATCH 20/20] style

---
 src/transformers/debug_utils.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/transformers/debug_utils.py b/src/transformers/debug_utils.py
index c085a9e46a520e..45384a80134ba1 100644
--- a/src/transformers/debug_utils.py
+++ b/src/transformers/debug_utils.py
@@ -93,9 +93,8 @@ class DebugUnderflowOverflow:

     The second work mode is per-batch tracing with the underflow/overflow detection feature turned off.

-    Let's say you want to watch the absolute min and max values for all the ingredients of each
-    ``forward`` call of a given batch, and only do that for batches 1 and 3. Then you instantiate
-    this class as ::
+    Let's say you want to watch the absolute min and max values for all the ingredients of each ``forward`` call of a
+    given batch, and only do that for batches 1 and 3. Then you instantiate this class as ::

         debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1,3])
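
For context, here is a minimal sketch of how the detector introduced by these patches might be driven outside of the Trainer, e.g. from a hand-rolled training loop. The toy model, optimizer and random inputs below are illustrative stand-ins and are not part of the patches; only the ``DebugUnderflowOverflow`` constructor arguments come from the code above ::

    import torch
    from torch import nn
    from transformers.debug_utils import DebugUnderflowOverflow

    # any nn.Module-based model works - the detector attaches forward hooks to every submodule
    model = nn.Sequential(nn.Linear(8, 8), nn.ReLU(), nn.Linear(8, 1))
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

    # detection mode: keep the last frames and raise with a report as soon as
    # an inf/nan shows up in a weight, input or output
    debug_overflow = DebugUnderflowOverflow(model, max_frames_to_save=40)

    # tracing mode instead: dump abs min/max frames for batches 1 and 3 and
    # optionally stop early via abort_after_batch_num
    # debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3], abort_after_batch_num=4)

    for step in range(8):
        inputs = torch.randn(4, 8)
        loss = model(inputs).pow(2).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

No other changes to the loop are needed - the forward hooks do the recording and reporting. With the Trainer itself, the same is enabled by passing ``--debug underflow_overflow`` on the command line or ``debug="underflow_overflow"`` to ``TrainingArguments``, as the debugging.rst changes above describe.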