From 12cfa52f5b313a9ae3ee8c31957a138846148547 Mon Sep 17 00:00:00 2001 From: calpt <36051308+calpt@users.noreply.github.com> Date: Wed, 9 Feb 2022 15:47:41 +0100 Subject: [PATCH 01/10] WIP: class refactoring --- adapter_docs/classes/models/bart.rst | 51 +--- adapter_docs/classes/models/bert.rst | 78 +---- adapter_docs/classes/models/distilbert.rst | 57 +--- .../classes/models/encoderdecoder.rst | 6 - adapter_docs/classes/models/gpt2.rst | 83 +----- adapter_docs/classes/models/mbart.rst | 77 +---- adapter_docs/classes/models/roberta.rst | 50 +--- adapter_docs/classes/models/t5.rst | 48 +-- adapter_docs/classes/models/xlmroberta.rst | 57 +--- adapter_docs/classes/weights_loaders.rst | 38 --- adapter_docs/index.rst | 24 +- adapter_docs/model_overview.md | 3 + src/transformers/__init__.py | 150 +++++----- src/transformers/adapters/__init__.py | 201 +++++++++++++ src/transformers/adapters/mixins/__init__.py | 0 src/transformers/adapters/mixins/bart.py | 50 ++++ src/transformers/adapters/mixins/bert.py | 34 +++ .../adapters/mixins/distilbert.py | 24 ++ .../{models => mixins}/encoder_decoder.py | 0 src/transformers/adapters/mixins/gpt2.py | 22 ++ src/transformers/adapters/mixins/t5.py | 47 +++ src/transformers/adapters/models/auto.py | 57 ++++ src/transformers/adapters/models/bart.py | 225 +++++++++++--- src/transformers/adapters/models/bert.py | 152 ++++++++-- .../adapters/models/distilbert.py | 278 ++++++++++++++++-- src/transformers/adapters/models/gpt2.py | 144 +++++++-- src/transformers/adapters/models/mbart.py | 260 ++++++++++++++++ src/transformers/adapters/models/roberta.py | 259 ++++++++++++++++ src/transformers/adapters/models/t5.py | 244 +++++++++++---- .../adapters/models/xlm_roberta.py | 29 ++ src/transformers/models/auto/__init__.py | 4 - src/transformers/models/auto/modeling_auto.py | 21 -- src/transformers/models/bart/__init__.py | 2 - src/transformers/models/bart/modeling_bart.py | 141 +-------- src/transformers/models/bert/__init__.py | 2 - src/transformers/models/bert/modeling_bert.py | 83 +----- .../models/distilbert/__init__.py | 2 - .../models/distilbert/modeling_distilbert.py | 87 +----- .../modeling_encoder_decoder.py | 2 +- src/transformers/models/gpt2/__init__.py | 2 - src/transformers/models/gpt2/modeling_gpt2.py | 84 +----- src/transformers/models/mbart/__init__.py | 2 - .../models/mbart/modeling_mbart.py | 141 +-------- src/transformers/models/roberta/__init__.py | 2 - .../models/roberta/modeling_roberta.py | 90 +----- src/transformers/models/t5/__init__.py | 2 - src/transformers/models/t5/modeling_t5.py | 151 +--------- .../models/xlm_roberta/__init__.py | 2 - .../xlm_roberta/modeling_xlm_roberta.py | 14 - 49 files changed, 1974 insertions(+), 1608 deletions(-) delete mode 100644 adapter_docs/classes/weights_loaders.rst create mode 100644 adapter_docs/model_overview.md create mode 100644 src/transformers/adapters/mixins/__init__.py create mode 100644 src/transformers/adapters/mixins/bart.py create mode 100644 src/transformers/adapters/mixins/bert.py create mode 100644 src/transformers/adapters/mixins/distilbert.py rename src/transformers/adapters/{models => mixins}/encoder_decoder.py (100%) create mode 100644 src/transformers/adapters/mixins/gpt2.py create mode 100644 src/transformers/adapters/mixins/t5.py create mode 100644 src/transformers/adapters/models/auto.py create mode 100644 src/transformers/adapters/models/mbart.py create mode 100644 src/transformers/adapters/models/roberta.py create mode 100644 src/transformers/adapters/models/xlm_roberta.py diff --git 
a/adapter_docs/classes/models/bart.rst b/adapter_docs/classes/models/bart.rst index 414989c1c5..86630fd3c6 100644 --- a/adapter_docs/classes/models/bart.rst +++ b/adapter_docs/classes/models/bart.rst @@ -16,57 +16,10 @@ According to the abstract, state-of-the-art results on a range of abstractive dialogue, question answering, and summarization tasks, with gains of up to 6 ROUGE. -.. note:: - This class is nearly identical to the PyTorch implementation of BART in Huggingface Transformers. - For more information, visit `the corresponding section in their documentation `_. - -BartConfig -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BartConfig - :members: - - -BartTokenizer -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BartTokenizer - :members: - - - -BartModel -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BartModel - :members: forward - - -BartModelWithHeads +BartAdapterModel ~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.BartModelWithHeads +.. autoclass:: transformers.adapters.BartAdapterModel :members: :inherited-members: BartPretrainedModel - - -BartForConditionalGeneration -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BartForConditionalGeneration - :members: forward - - -BartForSequenceClassification -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BartForSequenceClassification - :members: forward - - -BartForQuestionAnswering -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BartForQuestionAnswering - :members: forward diff --git a/adapter_docs/classes/models/bert.rst b/adapter_docs/classes/models/bert.rst index 6898c5c2d6..06695ad24d 100644 --- a/adapter_docs/classes/models/bert.rst +++ b/adapter_docs/classes/models/bert.rst @@ -5,84 +5,10 @@ The BERT model was proposed in `BERT: Pre-training of Deep Bidirectional Transfo by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. It is a bidirectional transformer pre-trained using a combination of masked language modeling objective and next sentence prediction. -.. note:: - This class is nearly identical to the PyTorch implementation of BERT in Huggingface Transformers. - For more information, visit `the corresponding section in their documentation `_. -BertConfig -~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BertConfig - :members: - - -BertTokenizer -~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BertTokenizer - :members: build_inputs_with_special_tokens, get_special_tokens_mask, - create_token_type_ids_from_sequences, save_vocabulary - - -BertModel -~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BertModel - :members: - - -BertModelWithHeads +BertAdapterModel ~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.BertModelWithHeads +.. autoclass:: transformers.adapters.BertModelWithHeads :members: :inherited-members: BertPreTrainedModel - - -BertForPreTraining -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
autoclass:: transformers.BertForPreTraining - :members: - - -BertForMaskedLM -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BertForMaskedLM - :members: - - -BertForNextSentencePrediction -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BertForNextSentencePrediction - :members: - - -BertForSequenceClassification -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BertForSequenceClassification - :members: - - -BertForMultipleChoice -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BertForMultipleChoice - :members: - - -BertForTokenClassification -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BertForTokenClassification - :members: - - -BertForQuestionAnswering -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BertForQuestionAnswering - :members: diff --git a/adapter_docs/classes/models/distilbert.rst b/adapter_docs/classes/models/distilbert.rst index 053a827cfd..ec12de6764 100644 --- a/adapter_docs/classes/models/distilbert.rst +++ b/adapter_docs/classes/models/distilbert.rst @@ -8,63 +8,10 @@ DistilBERT is a small, fast, cheap and light Transformer model trained by distil parameters than `bert-base-uncased`, runs 60% faster while preserving over 95% of Bert's performances as measured on the GLUE language understanding benchmark. -.. note:: - This class is nearly identical to the PyTorch implementation of DistilBERT in Huggingface Transformers. - For more information, visit `the corresponding section in their documentation `_. - -DistilBertConfig -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.DistilBertConfig - :members: - - -DistilBertTokenizer -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.DistilBertTokenizer - :members: - - -DistilBertTokenizerFast -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.DistilBertTokenizerFast - :members: - - -DistilBertModel -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.DistilBertModel - :members: - - -DistilBertModelWithHeads +DistilBertAdapterModel ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.DistilBertModelWithHeads +.. autoclass:: transformers.adapters.DistilBertAdapterModel :members: :inherited-members: DistilBertPreTrainedModel - - -DistilBertForMaskedLM -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.DistilBertForMaskedLM - :members: - - -DistilBertForSequenceClassification -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.DistilBertForSequenceClassification - :members: - - -DistilBertForQuestionAnswering -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.DistilBertForQuestionAnswering - :members: diff --git a/adapter_docs/classes/models/encoderdecoder.rst b/adapter_docs/classes/models/encoderdecoder.rst index 7dd740b018..1e0f78ab0b 100644 --- a/adapter_docs/classes/models/encoderdecoder.rst +++ b/adapter_docs/classes/models/encoderdecoder.rst @@ -31,12 +31,6 @@ and decoder for a summarization model as was shown in: `Text Summarization with This class is nearly identical to the PyTorch implementation of DistilBERT in Huggingface Transformers. For more information, visit `the corresponding section in their documentation `_. -EncoderDecoderConfig -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
autoclass:: transformers.EncoderDecoderConfig - :members: - EncoderDecoderModel ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/adapter_docs/classes/models/gpt2.rst b/adapter_docs/classes/models/gpt2.rst index 7c03e7bd0a..bb0917dbb4 100644 --- a/adapter_docs/classes/models/gpt2.rst +++ b/adapter_docs/classes/models/gpt2.rst @@ -1,9 +1,6 @@ OpenAI GPT2 ----------------------------------------------------------------------------------------------------------------------- -Overview -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - OpenAI GPT-2 model was proposed in `Language Models are Unsupervised Multitask Learners `_ by Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei and Ilya Sutskever. It's a causal (unidirectional) @@ -17,86 +14,10 @@ text. The diversity of the dataset causes this simple goal to contain naturally across diverse domains. GPT-2 is a direct scale-up of GPT, with more than 10X the parameters and trained on more than 10X the amount of data.* -Tips: - -- GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than - the left. -- GPT-2 was trained with a causal language modeling (CLM) objective and is therefore powerful at predicting the next - token in a sequence. Leveraging this feature allows GPT-2 to generate syntactically coherent text as it can be - observed in the `run_generation.py` example script. -- The PyTorch models can take the `past` as input, which is the previously computed key/value attention pairs. Using - this `past` value prevents the model from re-computing pre-computed values in the context of text generation. See - `reusing the past in generative models <../quickstart.html#using-the-past>`__ for more information on the usage of - this argument. - -`Write With Transformer `__ is a webapp created and hosted by -Hugging Face showcasing the generative capabilities of several models. GPT-2 is one of them and is available in five -different sizes: small, medium, large, xl and a distilled version of the small checkpoint: `distilgpt-2`. - -.. note:: - This class is nearly identical to the PyTorch implementation of BERT in Huggingface Transformers. - For more information, visit `the corresponding section in their documentation `_. - -GPT2Config +GPT2AdapterModel ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.GPT2Config - :members: - - -GPT2Tokenizer -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.GPT2Tokenizer - :members: save_vocabulary - - -GPT2TokenizerFast -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.GPT2TokenizerFast - :members: - - -GPT2 specific outputs -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.models.gpt2.modeling_gpt2.GPT2DoubleHeadsModelOutput - :members: - - -GPT2Model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
autoclass:: transformers.GPT2Model - :members: forward - - -GPT2ModelWithHeads -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.GPT2ModelWithHeads +.. autoclass:: transformers.adapters.GPT2AdapterModel :members: :inherited-members: GPT2PreTrainedModel - - -GPT2LMHeadModel -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.GPT2LMHeadModel - :members: forward - - -GPT2DoubleHeadsModel -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.GPT2DoubleHeadsModel - :members: forward - - -GPT2ForSequenceClassification -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.GPT2ForSequenceClassification - :members: forward diff --git a/adapter_docs/classes/models/mbart.rst b/adapter_docs/classes/models/mbart.rst index d2ab6f5c26..bc9106581e 100644 --- a/adapter_docs/classes/models/mbart.rst +++ b/adapter_docs/classes/models/mbart.rst @@ -10,83 +10,10 @@ corpora in many languages using the BART objective. mBART is one of the first me sequence-to-sequence model by denoising full texts in multiple languages, while previous approaches have focused only on the encoder, decoder, or reconstructing parts of the text. -.. note:: - This class is nearly identical to the PyTorch implementation of MBart in Huggingface Transformers. - For more information, visit `the corresponding section in their documentation `_. - -MBartConfig -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.MBartConfig - :members: - - -MBartTokenizer -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.MBartTokenizer - :members: as_target_tokenizer, build_inputs_with_special_tokens - - -MBartTokenizerFast -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.MBartTokenizerFast - :members: - - -MBart50Tokenizer -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.MBart50Tokenizer - :members: - - -MBart50TokenizerFast -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.MBart50TokenizerFast - :members: - - -MBartModel -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.MBartModel - :members: - - -MBartModelWithHeads +MBartAdapterModel ~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.MBartModelWithHeads +.. autoclass:: transformers.adapters.MBartAdapterModel :members: :inherited-members: MBartPreTrainedModel - - -MBartForConditionalGeneration -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
autoclass:: transformers.MBartForConditionalGeneration - :members: - - -MBartForQuestionAnswering -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.MBartForQuestionAnswering - :members: - - -MBartForSequenceClassification -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.MBartForSequenceClassification - - -MBartForCausalLM -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.MBartForCausalLM - :members: forward diff --git a/adapter_docs/classes/models/roberta.rst b/adapter_docs/classes/models/roberta.rst index 3026733b65..3e429d3693 100644 --- a/adapter_docs/classes/models/roberta.rst +++ b/adapter_docs/classes/models/roberta.rst @@ -5,56 +5,10 @@ The RoBERTa model was proposed in `RoBERTa: A Robustly Optimized BERT Pretrainin by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. It is based on Google's BERT model released in 2018. -.. note:: - This class is nearly identical to the PyTorch implementation of RoBERTa in Huggingface Transformers. - For more information, visit `the corresponding section in their documentation `_. -RobertaConfig -~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.RobertaConfig - :members: - - -RobertaTokenizer -~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.RobertaTokenizer - :members: build_inputs_with_special_tokens, get_special_tokens_mask, - create_token_type_ids_from_sequences, save_vocabulary - - -RobertaModel -~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.RobertaModel - :members: - - -RobertaModelWithHeads +RobertaAdapterModel ~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.RobertaModelWithHeads +.. autoclass:: transformers.adapters.RobertaAdapterModel :members: :inherited-members: RobertaPreTrainedModel - - -RobertaForMaskedLM -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.RobertaForMaskedLM - :members: - - -RobertaForSequenceClassification -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.RobertaForSequenceClassification - :members: - - -RobertaForTokenClassification -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.RobertaForTokenClassification - :members: diff --git a/adapter_docs/classes/models/t5.rst b/adapter_docs/classes/models/t5.rst index 79b79034d0..b8be5993bd 100644 --- a/adapter_docs/classes/models/t5.rst +++ b/adapter_docs/classes/models/t5.rst @@ -16,54 +16,10 @@ The abstract from the paper is the following, For more information about which prefix to use, it is easiest to look into Appendix D of the `paper `__. -.. note:: - This class is nearly identical to the PyTorch implementation of T5 in Huggingface Transformers. - For more information, visit `the corresponding section in their documentation `_. - -T5Config -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.T5Config - :members: - - -T5Tokenizer -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
autoclass:: transformers.T5Tokenizer - :members: - - -T5TokenizerFast -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.T5TokenizerFast - :members: - - -T5Model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.T5Model - :members: forward - -T5ModelWithHeads +T5AdapterModel ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.T5ModelWithHeads +.. autoclass:: transformers.adapters.T5AdapterModel :members: :inherited-members: T5PreTrainedModel - - - -T5ForConditionalGeneration -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.T5ForConditionalGeneration - :members: forward - -T5EncoderModel -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.T5EncoderModel - :members: forward diff --git a/adapter_docs/classes/models/xlmroberta.rst b/adapter_docs/classes/models/xlmroberta.rst index 6ed7f3d1dc..6a0c8c9282 100644 --- a/adapter_docs/classes/models/xlmroberta.rst +++ b/adapter_docs/classes/models/xlmroberta.rst @@ -6,62 +6,9 @@ by Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaum Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. It is based on Facebook's RoBERTa model released in 2019. It is a large multi-lingual language model, trained on 2.5TB of filtered CommonCrawl data. -.. note:: - This class is nearly identical to the PyTorch implementation of XLM-RoBERTa in Huggingface Transformers. - For more information, visit `the corresponding section in their documentation `_. -XLMRobertaConfig +XLMRobertaAdapterModel ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.XLMRobertaConfig - :members: - - -XLMRobertaTokenizer -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.XLMRobertaTokenizer - :members: build_inputs_with_special_tokens, get_special_tokens_mask, - create_token_type_ids_from_sequences, save_vocabulary - - -XLMRobertaModel -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.XLMRobertaModel - :members: - - -XLMRobertaModelWithHeads -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.XLMRobertaModelWithHeads - :members: - - -XLMRobertaForMaskedLM -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.XLMRobertaForMaskedLM - :members: - - -XLMRobertaForSequenceClassification -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.XLMRobertaForSequenceClassification - :members: - - -XLMRobertaForMultipleChoice -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.XLMRobertaForMultipleChoice - :members: - - -XLMRobertaForTokenClassification -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.XLMRobertaForTokenClassification +.. 
autoclass:: transformers.adapters.XLMRobertaAdapterModel :members: diff --git a/adapter_docs/classes/weights_loaders.rst b/adapter_docs/classes/weights_loaders.rst deleted file mode 100644 index e7b14bd63d..0000000000 --- a/adapter_docs/classes/weights_loaders.rst +++ /dev/null @@ -1,38 +0,0 @@ -Weights Loaders -======================= - -These classes perform the extraction, saving and loading of module weights to and from the file system. -All type-specific loader classes inherit from the common ``WeightsLoader`` base class which can also be extended -to add support for additional custom modules. - -These classes provide the basis of adapter module integration into model classes such as adapter saving and loading. -Depending on the model, one of these mixins should be implemented by every adapter-supporting model class. - -WeightsLoader ------------------- - -.. autoclass:: transformers.WeightsLoader - :members: - -AdapterLoader ---------------------------- - -.. autoclass:: transformers.AdapterLoader - :members: - -AdapterFusionLoader ---------------------------- -.. autoclass:: transformers.AdapterFusionLoader - :members: - -PredictionHeadLoader ---------------------------- - -.. autoclass:: transformers.PredictionHeadLoader - :members: - -WeightsLoaderHelper -------------------- - -.. autoclass:: transformers.WeightsLoaderHelper - :members: diff --git a/adapter_docs/index.rst b/adapter_docs/index.rst index c337fb7597..715a0cd92b 100644 --- a/adapter_docs/index.rst +++ b/adapter_docs/index.rst @@ -43,22 +43,11 @@ Currently, we support the PyTorch versions of all models listed in the *Supporte contributing huggingface_hub -.. toctree:: - :maxdepth: 2 - :caption: Adapter-Related Classes - - classes/adapter_config - classes/model_adapters_config - classes/adapter_modules - classes/adapter_layer - classes/model_mixins - classes/adapter_utils - classes/weights_loaders - .. toctree:: :maxdepth: 1 :caption: Supported Models + model_overview classes/models/bart classes/models/bert classes/models/distilbert @@ -69,6 +58,17 @@ Currently, we support the PyTorch versions of all models listed in the *Supporte classes/models/t5 classes/models/xlmroberta +.. 
toctree:: + :maxdepth: 2 + :caption: Adapter-Related Classes + + classes/adapter_config + classes/model_adapters_config + classes/adapter_modules + classes/adapter_layer + classes/model_mixins + classes/adapter_utils + Citation ======== diff --git a/adapter_docs/model_overview.md b/adapter_docs/model_overview.md new file mode 100644 index 0000000000..31738d1950 --- /dev/null +++ b/adapter_docs/model_overview.md @@ -0,0 +1,3 @@ +# Model Overview + +TODO-AH diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index ee3a1dcf15..f8172f627b 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -617,7 +617,6 @@ "AutoModelForSpeechSeq2Seq", "AutoModelForTableQuestionAnswering", "AutoModelForTokenClassification", - "AutoModelWithHeads", "AutoModelWithLMHead", ] ) @@ -629,7 +628,6 @@ "BartForQuestionAnswering", "BartForSequenceClassification", "BartModel", - "BartModelWithHeads", "BartPretrainedModel", "PretrainedBartModel", ] @@ -656,7 +654,6 @@ "BertLayer", "BertLMHeadModel", "BertModel", - "BertModelWithHeads", "BertPreTrainedModel", "load_tf_weights_in_bert", ] @@ -811,7 +808,6 @@ "DistilBertForSequenceClassification", "DistilBertForTokenClassification", "DistilBertModel", - "DistilBertModelWithHeads", "DistilBertPreTrainedModel", ] ) @@ -895,7 +891,6 @@ "GPT2ForTokenClassification", "GPT2LMHeadModel", "GPT2Model", - "GPT2ModelWithHeads", "GPT2PreTrainedModel", "load_tf_weights_in_gpt2", ] @@ -1020,7 +1015,6 @@ "MBartForQuestionAnswering", "MBartForSequenceClassification", "MBartModel", - "MBartModelWithHeads", "MBartPreTrainedModel", ] ) @@ -1139,7 +1133,6 @@ "RobertaForSequenceClassification", "RobertaForTokenClassification", "RobertaModel", - "RobertaModelWithHeads", "RobertaPreTrainedModel", ] ) @@ -1225,7 +1218,6 @@ "T5EncoderModel", "T5ForConditionalGeneration", "T5Model", - "T5ModelWithHeads", "T5PreTrainedModel", "load_tf_weights_in_t5", ] @@ -1330,7 +1322,6 @@ "XLMRobertaForSequenceClassification", "XLMRobertaForTokenClassification", "XLMRobertaModel", - "XLMRobertaModelWithHeads", ] ) _import_structure["models.xlnet"].extend( @@ -1368,57 +1359,57 @@ # Adapters if is_torch_available(): - _import_structure["adapters.configuration"] = [ + _import_structure["adapters"] = [ + "ADAPTER_CACHE", "ADAPTER_CONFIG_MAP", "ADAPTERFUSION_CONFIG_MAP", + "ADAPTER_MODEL_MAPPING", "DEFAULT_ADAPTER_CONFIG", "DEFAULT_ADAPTERFUSION_CONFIG", + "AdapterArguments", "AdapterConfig", "AdapterFusionConfig", + "AdapterInfo", + "AdapterLayer", + "AdapterSetup", + "AdapterTrainer", + "AdapterType", + "AutoAdapterModel", + "AutoModelWithHeads", + "BartAdapterModel", + "BartModelWithHeads", + "BertAdapterModel", + "BertModelWithHeads", + "DistilBertAdapterModel", + "DistilBertModelWithHeads", "DynamicAdapterFusionConfig", + "ForwardContext", + "GPT2AdapterModel", + "GPT2ModelWithHeads", "HoulsbyConfig", "HoulsbyInvConfig", - "ModelAdaptersConfig", - "PfeifferConfig", - "PfeifferInvConfig", - "StaticAdapterFusionConfig", - ] - _import_structure["adapters.context"] = ["AdapterSetup"] - _import_structure["adapters.heads"] = ["ModelWithFlexibleHeadsAdaptersMixin"] - _import_structure["adapters.layer"] = ["AdapterLayer"] - _import_structure["adapters.loading"] = [ - "AdapterFusionLoader", - "AdapterLoader", - "PredictionHeadLoader", - "WeightsLoader", - "WeightsLoaderHelper", - ] - _import_structure["adapters.model_mixin"] = [ "InvertibleAdaptersMixin", + "MBartAdapterModel", + "MBartModelWithHeads", + "ModelAdaptersConfig", "ModelAdaptersMixin", "ModelConfigAdaptersMixin", + 
"ModelWithFlexibleHeadsAdaptersMixin", "ModelWithHeadsAdaptersMixin", - ] - _import_structure["adapters.trainer"] = [ - "AdapterTrainer", - "Seq2SeqAdapterTrainer", - ] - _import_structure["adapters.training"] = [ - "AdapterArguments", "MultiLingAdapterArguments", - ] - _import_structure["adapters.utils"] = [ - "ADAPTER_CACHE", - "ADAPTER_HUB_INDEX_FILE", - "ADAPTER_HUB_URL", - "AdapterInfo", - "AdapterType", + "PfeifferConfig", + "PfeifferInvConfig", + "RobertaAdapterModel", + "RobertaModelWithHeads", + "Seq2SeqAdapterTrainer", + "StaticAdapterFusionConfig", + "T5AdapterModel", + "T5ModelWithHeads", + "XLMRobertaAdapterModel", + "XLMRobertaModelWithHeads", "get_adapter_config_hash", "get_adapter_info", "list_adapters", - "pull_from_hub", - "resolve_adapter_config", - "resolve_adapter_path", ] # TensorFlow-backed objects @@ -2528,7 +2519,6 @@ AutoModelForSpeechSeq2Seq, AutoModelForTableQuestionAnswering, AutoModelForTokenClassification, - AutoModelWithHeads, AutoModelWithLMHead, ) from .models.bart import ( @@ -2538,7 +2528,6 @@ BartForQuestionAnswering, BartForSequenceClassification, BartModel, - BartModelWithHeads, BartPretrainedModel, PretrainedBartModel, ) @@ -2561,7 +2550,6 @@ BertLayer, BertLMHeadModel, BertModel, - BertModelWithHeads, BertPreTrainedModel, load_tf_weights_in_bert, ) @@ -2688,7 +2676,6 @@ DistilBertForSequenceClassification, DistilBertForTokenClassification, DistilBertModel, - DistilBertModelWithHeads, DistilBertPreTrainedModel, ) from .models.dpr import ( @@ -2760,7 +2747,6 @@ GPT2ForTokenClassification, GPT2LMHeadModel, GPT2Model, - GPT2ModelWithHeads, GPT2PreTrainedModel, load_tf_weights_in_gpt2, ) @@ -2861,7 +2847,6 @@ MBartForQuestionAnswering, MBartForSequenceClassification, MBartModel, - MBartModelWithHeads, MBartPreTrainedModel, ) from .models.megatron_bert import ( @@ -2963,7 +2948,6 @@ RobertaForSequenceClassification, RobertaForTokenClassification, RobertaModel, - RobertaModelWithHeads, RobertaPreTrainedModel, ) from .models.roformer import ( @@ -3033,7 +3017,6 @@ T5EncoderModel, T5ForConditionalGeneration, T5Model, - T5ModelWithHeads, T5PreTrainedModel, load_tf_weights_in_t5, ) @@ -3118,7 +3101,6 @@ XLMRobertaForSequenceClassification, XLMRobertaForTokenClassification, XLMRobertaModel, - XLMRobertaModelWithHeads, ) from .models.xlnet import ( XLNET_PRETRAINED_MODEL_ARCHIVE_LIST, @@ -3155,51 +3137,57 @@ # Adapters if is_torch_available(): - from .adapters.config import ( + from .adapters import ( + ADAPTER_CACHE, ADAPTER_CONFIG_MAP, + ADAPTER_MODEL_MAPPING, ADAPTERFUSION_CONFIG_MAP, DEFAULT_ADAPTER_CONFIG, DEFAULT_ADAPTERFUSION_CONFIG, + AdapterArguments, AdapterConfig, AdapterFusionConfig, + AdapterInfo, + AdapterLayer, + AdapterSetup, + AdapterTrainer, + AdapterType, + AutoAdapterModel, + AutoModelWithHeads, + BartAdapterModel, + BartModelWithHeads, + BertAdapterModel, + BertModelWithHeads, + DistilBertAdapterModel, + DistilBertModelWithHeads, DynamicAdapterFusionConfig, + ForwardContext, + GPT2AdapterModel, + GPT2ModelWithHeads, HoulsbyConfig, HoulsbyInvConfig, - ModelAdaptersConfig, - PfeifferConfig, - PfeifferInvConfig, - StaticAdapterFusionConfig, - ) - from .adapters.context import AdapterSetup - from .adapters.heads import ModelWithFlexibleHeadsAdaptersMixin - from .adapters.layer import AdapterLayer - from .adapters.loading import ( - AdapterFusionLoader, - AdapterLoader, - PredictionHeadLoader, - WeightsLoader, - WeightsLoaderHelper, - ) - from .adapters.model_mixin import ( InvertibleAdaptersMixin, + MBartAdapterModel, + MBartModelWithHeads, + 
ModelAdaptersConfig, ModelAdaptersMixin, ModelConfigAdaptersMixin, + ModelWithFlexibleHeadsAdaptersMixin, ModelWithHeadsAdaptersMixin, - ) - from .adapters.trainer import AdapterTrainer, Seq2SeqAdapterTrainer - from .adapters.training import AdapterArguments, MultiLingAdapterArguments - from .adapters.utils import ( - ADAPTER_CACHE, - ADAPTER_HUB_INDEX_FILE, - ADAPTER_HUB_URL, - AdapterInfo, - AdapterType, + MultiLingAdapterArguments, + PfeifferConfig, + PfeifferInvConfig, + RobertaAdapterModel, + RobertaModelWithHeads, + Seq2SeqAdapterTrainer, + StaticAdapterFusionConfig, + T5AdapterModel, + T5ModelWithHeads, + XLMRobertaAdapterModel, + XLMRobertaModelWithHeads, get_adapter_config_hash, get_adapter_info, list_adapters, - pull_from_hub, - resolve_adapter_config, - resolve_adapter_path, ) # TensorFlow diff --git a/src/transformers/adapters/__init__.py b/src/transformers/adapters/__init__.py index e69de29bb2..1f343ae5a7 100644 --- a/src/transformers/adapters/__init__.py +++ b/src/transformers/adapters/__init__.py @@ -0,0 +1,201 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2020 The Adapter-Hub Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import TYPE_CHECKING + +from ..file_utils import _LazyModule + + +_import_structure = { + "composition": [ + "AdapterCompositionBlock", + "BatchSplit", + "Fuse", + "Parallel", + "Split", + "Stack", + "parse_composition", + "validate_composition", + ], + "configuration": [ + "ADAPTER_CONFIG_MAP", + "ADAPTERFUSION_CONFIG_MAP", + "DEFAULT_ADAPTER_CONFIG", + "DEFAULT_ADAPTERFUSION_CONFIG", + "AdapterConfig", + "AdapterFusionConfig", + "DynamicAdapterFusionConfig", + "HoulsbyConfig", + "HoulsbyInvConfig", + "ModelAdaptersConfig", + "PfeifferConfig", + "PfeifferInvConfig", + "StaticAdapterFusionConfig", + ], + "context": [ + "AdapterSetup", + "ForwardContext", + ], + "heads": [ + "BertStyleMaskedLMHead", + "BiaffineParsingHead", + "CausalLMHead", + "ClassificationHead", + "DependencyParsingOutput", + "ModelWithFlexibleHeadsAdaptersMixin", + "MultiHeadOutput", + "MultiLabelClassificationHead", + "MultipleChoiceHead", + "PredictionHead", + "QuestionAnsweringHead", + "Seq2SeqLMHead", + "TaggingHead", + ], + "layer": ["AdapterLayer"], + "model_mixin": [ + "InvertibleAdaptersMixin", + "ModelAdaptersMixin", + "ModelConfigAdaptersMixin", + "ModelWithHeadsAdaptersMixin", + ], + "models.auto": [ + "ADAPTER_MODEL_MAPPING", + "AutoAdapterModel", + "AutoModelWithHeads", + ], + "models.bart": [ + "BartAdapterModel", + "BartModelWithHeads", + ], + "models.bert": [ + "BertAdapterModel", + "BertModelWithHeads", + ], + "models.distilbert": [ + "DistilBertAdapterModel", + "DistilBertModelWithHeads", + ], + "models.gpt2": [ + "GPT2AdapterModel", + "GPT2ModelWithHeads", + ], + "models.mbart": [ + "MBartAdapterModel", + "MBartModelWithHeads", + ], + "models.roberta": [ + "RobertaAdapterModel", + "RobertaModelWithHeads", + ], + "models.t5": [ + "T5AdapterModel", + "T5ModelWithHeads", + ], + "models.xlm_roberta": [ + "XLMRobertaAdapterModel", + "XLMRobertaModelWithHeads", + ], + "trainer": ["AdapterTrainer", "Seq2SeqAdapterTrainer"], + "training": [ + "AdapterArguments", + "MultiLingAdapterArguments", + ], + "utils": [ + "ADAPTER_CACHE", + "AdapterInfo", + "AdapterType", + "get_adapter_config_hash", + "get_adapter_info", + "list_adapters", + ], +} + + +if TYPE_CHECKING: + from .composition import ( + AdapterCompositionBlock, + BatchSplit, + Fuse, + Parallel, + Split, + Stack, + parse_composition, + validate_composition, + ) + from .configuration import ( + ADAPTER_CONFIG_MAP, + ADAPTERFUSION_CONFIG_MAP, + DEFAULT_ADAPTER_CONFIG, + DEFAULT_ADAPTERFUSION_CONFIG, + AdapterConfig, + AdapterFusionConfig, + DynamicAdapterFusionConfig, + HoulsbyConfig, + HoulsbyInvConfig, + ModelAdaptersConfig, + PfeifferConfig, + PfeifferInvConfig, + StaticAdapterFusionConfig, + ) + from .context import AdapterSetup, ForwardContext + from .heads import ( + BertStyleMaskedLMHead, + BiaffineParsingHead, + CausalLMHead, + ClassificationHead, + DependencyParsingOutput, + ModelWithFlexibleHeadsAdaptersMixin, + MultiHeadOutput, + MultiLabelClassificationHead, + MultipleChoiceHead, + PredictionHead, + QuestionAnsweringHead, + Seq2SeqLMHead, + TaggingHead, + ) + from .layer import AdapterLayer + from .model_mixin import ( + InvertibleAdaptersMixin, + ModelAdaptersMixin, + ModelConfigAdaptersMixin, + ModelWithHeadsAdaptersMixin, + ) + from .models.auto import ADAPTER_MODEL_MAPPING, AutoAdapterModel, AutoModelWithHeads + from .models.bart import BartAdapterModel, BartModelWithHeads + from .models.bert import BertAdapterModel, BertModelWithHeads + from .models.distilbert import DistilBertAdapterModel, DistilBertModelWithHeads + from 
.models.gpt2 import GPT2AdapterModel, GPT2ModelWithHeads + from .models.mbart import MBartAdapterModel, MBartModelWithHeads + from .models.roberta import RobertaAdapterModel, RobertaModelWithHeads + from .models.t5 import T5AdapterModel, T5ModelWithHeads + from .models.xlm_roberta import XLMRobertaAdapterModel, XLMRobertaModelWithHeads + from .trainer import AdapterTrainer, Seq2SeqAdapterTrainer + from .training import AdapterArguments, MultiLingAdapterArguments + from .utils import ( + ADAPTER_CACHE, + AdapterInfo, + AdapterType, + get_adapter_config_hash, + get_adapter_info, + list_adapters, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure) diff --git a/src/transformers/adapters/mixins/__init__.py b/src/transformers/adapters/mixins/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/transformers/adapters/mixins/bart.py b/src/transformers/adapters/mixins/bart.py new file mode 100644 index 0000000000..d6c0894fc8 --- /dev/null +++ b/src/transformers/adapters/mixins/bart.py @@ -0,0 +1,50 @@ +from typing import Iterable, Tuple + +import torch.nn as nn + +from ..layer import AdapterLayer +from ..model_mixin import InvertibleAdaptersMixin, ModelAdaptersMixin + + +class BartEncoderLayerAdaptersMixin: + """Adds adapters to the BartEncoderLayer module of BART.""" + + def _init_adapter_modules(self): + self.attention_adapters = AdapterLayer("mh_adapter", self.config) + self.output_adapters = AdapterLayer("output_adapter", self.config) + self.attention_adapters._init_adapter_modules() + self.output_adapters._init_adapter_modules() + + +class BartDecoderLayerAdaptersMixin(BartEncoderLayerAdaptersMixin): + """Adds adapters to the BartDecoderLayer module of BART.""" + + def _init_adapter_modules(self): + super()._init_adapter_modules() + self.cross_attention_adapters = AdapterLayer("cross_adapter", self.config) + self.cross_attention_adapters._init_adapter_modules() + + +class BartModelAdaptersMixin(InvertibleAdaptersMixin, ModelAdaptersMixin): + """Adds adapters to the BartModel class.""" + + def iter_layers(self) -> Iterable[Tuple[int, nn.Module]]: + if hasattr(self, "encoder"): + for i, layer in enumerate(self.encoder.layers): + yield i, layer + for i, layer in enumerate(self.decoder.layers, start=len(self.encoder.layers)): + yield i, layer + else: + for i, layer in enumerate(self.decoder.layers): + yield i, layer + + def _init_adapter_modules(self): + if hasattr(self, "encoder"): + # In BART, the invertible adapters are implemented by the encoder module. + # Therefore, relay mixin calls to the encoder here. 
+ self.invertible_adapters = self.encoder.invertible_adapters + self.add_invertible_adapter = self.encoder.add_invertible_adapter + self.get_invertible_adapter = self.encoder.get_invertible_adapter + self.enable_invertible_adapters = self.encoder.enable_invertible_adapters + self.invertible_adapters_forward = self.encoder.invertible_adapters_forward + super()._init_adapter_modules() diff --git a/src/transformers/adapters/mixins/bert.py b/src/transformers/adapters/mixins/bert.py new file mode 100644 index 0000000000..3dba2957bd --- /dev/null +++ b/src/transformers/adapters/mixins/bert.py @@ -0,0 +1,34 @@ +import logging +from typing import Iterable, Tuple + +import torch.nn as nn + +from ..layer import AdapterLayer +from ..model_mixin import InvertibleAdaptersMixin, ModelAdaptersMixin + + +logger = logging.getLogger(__name__) + + +# For backwards compatibility, BertSelfOutput inherits directly from AdapterLayer +class BertSelfOutputAdaptersMixin(AdapterLayer): + """Adds adapters to the BertSelfOutput module.""" + + def __init__(self): + super().__init__("mh_adapter", None) + + +# For backwards compatibility, BertOutput inherits directly from AdapterLayer +class BertOutputAdaptersMixin(AdapterLayer): + """Adds adapters to the BertOutput module.""" + + def __init__(self): + super().__init__("output_adapter", None) + + +class BertModelAdaptersMixin(InvertibleAdaptersMixin, ModelAdaptersMixin): + """Adds adapters to the BertModel module.""" + + def iter_layers(self) -> Iterable[Tuple[int, nn.Module]]: + for i, layer in enumerate(self.encoder.layer): + yield i, layer diff --git a/src/transformers/adapters/mixins/distilbert.py b/src/transformers/adapters/mixins/distilbert.py new file mode 100644 index 0000000000..c3b431f1f8 --- /dev/null +++ b/src/transformers/adapters/mixins/distilbert.py @@ -0,0 +1,24 @@ +from typing import Iterable, Tuple + +import torch.nn as nn + +from ..layer import AdapterLayer +from ..model_mixin import InvertibleAdaptersMixin, ModelAdaptersMixin + + +class DistilBertTransfomerBlockAdaptersMixin: + """Adds adapters to the TransformerBlock module of DistilBert.""" + + def _init_adapter_modules(self): + self.attention_adapters = AdapterLayer("mh_adapter", self.config) + self.output_adapters = AdapterLayer("output_adapter", self.config) + self.attention_adapters._init_adapter_modules() + self.output_adapters._init_adapter_modules() + + +class DistilBertModelAdaptersMixin(InvertibleAdaptersMixin, ModelAdaptersMixin): + """Adds adapters to the DistilBert module.""" + + def iter_layers(self) -> Iterable[Tuple[int, nn.Module]]: + for i, layer in enumerate(self.transformer.layer): + yield i, layer diff --git a/src/transformers/adapters/models/encoder_decoder.py b/src/transformers/adapters/mixins/encoder_decoder.py similarity index 100% rename from src/transformers/adapters/models/encoder_decoder.py rename to src/transformers/adapters/mixins/encoder_decoder.py diff --git a/src/transformers/adapters/mixins/gpt2.py b/src/transformers/adapters/mixins/gpt2.py new file mode 100644 index 0000000000..93cddbddea --- /dev/null +++ b/src/transformers/adapters/mixins/gpt2.py @@ -0,0 +1,22 @@ +from typing import Iterable, Tuple + +import torch.nn as nn + +from ..layer import AdapterLayer +from ..model_mixin import InvertibleAdaptersMixin, ModelAdaptersMixin + + +class GPT2DecoderBlockAdaptersMixin: + """Adds adapters to the TransformerBlock module of DistilBert.""" + + def _init_adapter_modules(self): + self.attention_adapters = AdapterLayer("mh_adapter", self.config) + self.output_adapters = 
AdapterLayer("output_adapter", self.config) + self.attention_adapters._init_adapter_modules() + self.output_adapters._init_adapter_modules() + + +class GPT2ModelAdapterMixin(InvertibleAdaptersMixin, ModelAdaptersMixin): + def iter_layers(self) -> Iterable[Tuple[int, nn.Module]]: + for i, layer in enumerate(self.base_model.h): + yield i, layer diff --git a/src/transformers/adapters/mixins/t5.py b/src/transformers/adapters/mixins/t5.py new file mode 100644 index 0000000000..bd1d8efdc5 --- /dev/null +++ b/src/transformers/adapters/mixins/t5.py @@ -0,0 +1,47 @@ +from typing import Iterable, Tuple + +import torch.nn as nn + +from ..layer import AdapterLayer +from ..model_mixin import InvertibleAdaptersMixin, ModelAdaptersMixin + + +class T5SelfAttentionLayerAdaptersMixin(AdapterLayer): + def __init__(self): + super().__init__("mh_adapter", None) + + +class T5CrossAttentionLayerAdaptersMixin(AdapterLayer): + def __init__(self): + super().__init__("cross_adapter", None) + + +class T5FFLayerAdaptersMixin(AdapterLayer): + def __init__(self): + super().__init__("output_adapter", None) + + +class T5ModelAdaptersMixin(InvertibleAdaptersMixin, ModelAdaptersMixin): + """Adds adapters to the T5Model class.""" + + def iter_layers(self) -> Iterable[Tuple[int, nn.Module]]: + if hasattr(self, "encoder"): + for i, layer in enumerate(self.encoder.block): + yield i, layer + for i, layer in enumerate(self.decoder.block, start=len(self.encoder.block)): + yield i, layer + else: + for i, layer in enumerate(self.decoder.block): + yield i, layer + + def _init_adapter_modules(self): + if hasattr(self, "encoder"): + # In T5, the invertible adapters are implemented by the encoder module. + # Therefore, relay mixin calls to the encoder here. + self.invertible_adapters = self.encoder.invertible_adapters + self.add_invertible_adapter = self.encoder.add_invertible_adapter + self.get_invertible_adapter = self.encoder.get_invertible_adapter + self.enable_invertible_adapters = self.encoder.enable_invertible_adapters + self.invertible_adapters_forward = self.encoder.invertible_adapters_forward + self.delete_invertible_adapter = self.encoder.delete_invertible_adapter + super()._init_adapter_modules() diff --git a/src/transformers/adapters/models/auto.py b/src/transformers/adapters/models/auto.py new file mode 100644 index 0000000000..7ed835af65 --- /dev/null +++ b/src/transformers/adapters/models/auto.py @@ -0,0 +1,57 @@ +import warnings +from collections import OrderedDict + +from ...models.auto.auto_factory import _BaseAutoModelClass, _LazyAutoMapping, auto_class_update +from ...models.auto.configuration_auto import CONFIG_MAPPING_NAMES + + +ADAPTER_MODEL_MAPPING_NAMES = OrderedDict( + [ + ("xlm-roberta", "XLMRobertaModelWithHeads"), + ("roberta", "RobertaModelWithHeads"), + ("bert", "BertModelWithHeads"), + ("distilbert", "DistilBertModelWithHeads"), + ("bart", "BartModelWithHeads"), + ("mbart", "MBartModelWithHeads"), + ("gpt2", "GPT2ModelWithHeads"), + ("t5", "T5ModelWithHeads"), + ] +) + +ADAPTER_MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, ADAPTER_MODEL_MAPPING_NAMES) + + +class AutoAdapterModel(_BaseAutoModelClass): + _model_mapping = ADAPTER_MODEL_MAPPING + + +AutoAdapterModel = auto_class_update(AutoAdapterModel, head_doc="adapters and flexible heads") + + +class AutoModelWithHeads(_BaseAutoModelClass): + _model_mapping = ADAPTER_MODEL_MAPPING + + @classmethod + def from_config(cls, config): + warnings.warn( + "This class has been renamed to `{}` in v3. 
" + "Please use the new class instead as this class might be removed in a future version.".format( + cls.__bases__[0].__name__ + ), + FutureWarning, + ) + return super().from_config(config) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + warnings.warn( + "This class has been renamed to `{}` in v3. " + "Please use the new class instead as this class might be removed in a future version.".format( + cls.__bases__[0].__name__ + ), + FutureWarning, + ) + return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + +AutoModelWithHeads = auto_class_update(AutoModelWithHeads, head_doc="flexible heads") diff --git a/src/transformers/adapters/models/bart.py b/src/transformers/adapters/models/bart.py index 590b573566..d400a96aa2 100644 --- a/src/transformers/adapters/models/bart.py +++ b/src/transformers/adapters/models/bart.py @@ -1,7 +1,25 @@ -from typing import Iterable, Tuple +import warnings -import torch.nn as nn +import torch +from ...file_utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, +) +from ...models.bart.modeling_bart import ( + _CHECKPOINT_FOR_DOC, + _CONFIG_FOR_DOC, + _TOKENIZER_FOR_DOC, + BART_INPUTS_DOCSTRING, + BART_START_DOCSTRING, + BartConfig, + BartModel, + BartPretrainedModel, + shift_tokens_right, +) +from ..composition import adjust_tensors_for_parallel from ..heads import ( ClassificationHead, ModelWithFlexibleHeadsAdaptersMixin, @@ -9,58 +27,141 @@ QuestionAnsweringHead, Seq2SeqLMHead, ) -from ..layer import AdapterLayer -from ..model_mixin import InvertibleAdaptersMixin, ModelAdaptersMixin - -class BartEncoderLayerAdaptersMixin: - """Adds adapters to the BartEncoderLayer module of BART.""" - def _init_adapter_modules(self): - self.attention_adapters = AdapterLayer("mh_adapter", self.config) - self.output_adapters = AdapterLayer("output_adapter", self.config) - self.attention_adapters._init_adapter_modules() - self.output_adapters._init_adapter_modules() +@add_start_docstrings( + "BART Model with the option to add multiple flexible prediction heads on top.", BART_START_DOCSTRING +) +class BartAdapterModel(ModelWithFlexibleHeadsAdaptersMixin, BartPretrainedModel): + def __init__(self, config: BartConfig, **kwargs): + super().__init__(config, **kwargs) + self.model = BartModel(config) + self._init_head_modules() -class BartDecoderLayerAdaptersMixin(BartEncoderLayerAdaptersMixin): - """Adds adapters to the BartDecoderLayer module of BART.""" + def get_encoder(self): + return self.model.get_encoder() - def _init_adapter_modules(self): - super()._init_adapter_modules() - self.cross_attention_adapters = AdapterLayer("cross_adapter", self.config) - self.cross_attention_adapters._init_adapter_modules() + def get_decoder(self): + return self.model.get_decoder() + @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=ModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + encoder_outputs=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + head=None, + **kwargs + ): + r""" + labels (:obj:`torch.LongTensor` of shape 
:obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict -class BartModelAdaptersMixin(InvertibleAdaptersMixin, ModelAdaptersMixin): - """Adds adapters to the BartModel class.""" + if "labels" in kwargs or "start_positions" in kwargs and "end_positions" in kwargs: + use_cache = False - def iter_layers(self) -> Iterable[Tuple[int, nn.Module]]: - if hasattr(self, "encoder"): - for i, layer in enumerate(self.encoder.layers): - yield i, layer - for i, layer in enumerate(self.decoder.layers, start=len(self.encoder.layers)): - yield i, layer + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # sequence classification based on last token in sequence + x = outputs[0] # last hidden state + if input_ids is not None and x.shape[1] == input_ids.shape[1]: + eos_mask = input_ids.eq(self.config.eos_token_id) + (eos_mask,) = adjust_tensors_for_parallel(x, eos_mask) + if len(torch.unique(eos_mask.sum(1))) > 1: + raise ValueError("All examples must have the same number of tokens.") + cls_representation = x[eos_mask, :].view(x.size(0), -1, x.size(-1))[:, -1, :] else: - for i, layer in enumerate(self.decoder.layers): - yield i, layer - - def _init_adapter_modules(self): - if hasattr(self, "encoder"): - # In BART, the invertible adapters are implemented by the encoder module. - # Therefore, relay mixin calls to the encoder here. - self.invertible_adapters = self.encoder.invertible_adapters - self.add_invertible_adapter = self.encoder.add_invertible_adapter - self.get_invertible_adapter = self.encoder.get_invertible_adapter - self.enable_invertible_adapters = self.encoder.enable_invertible_adapters - self.invertible_adapters_forward = self.encoder.invertible_adapters_forward - super()._init_adapter_modules() - - -class BartModelHeadsMixin(ModelWithFlexibleHeadsAdaptersMixin): - """ - Adds flexible heads to a BART model. - """ + cls_representation = x + + head_outputs = self.forward_head( + outputs, + head_name=head, + cls_output=cls_representation, + attention_mask=attention_mask, + return_dict=return_dict, + **kwargs, + ) + + return head_outputs + + # Copied from BartForConditionalGeneration + def prepare_inputs_for_generation( + self, + decoder_input_ids, + past=None, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + use_cache=None, + encoder_outputs=None, + **kwargs + ): + # cut decoder_input_ids if past is used + if past is not None: + decoder_input_ids = decoder_input_ids[:, -1:] + + return { + "input_ids": None, # encoder_outputs is defined. 
input_ids not needed + "encoder_outputs": encoder_outputs, + "past_key_values": past, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "cross_attn_head_mask": cross_attn_head_mask, + "use_cache": use_cache, # change this to avoid caching (presumably for debugging) + } + + # Copied from BartForConditionalGeneration + def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): + return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) + + # Copied from BartForConditionalGeneration + @staticmethod + def _reorder_cache(past, beam_idx): + reordered_past = () + for layer_past in past: + # cached cross_attention states don't have to be reordered -> they are always the same + reordered_past += ( + tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:], + ) + return reordered_past head_types = { "classification": ClassificationHead, @@ -123,3 +224,37 @@ def add_seq2seq_lm_head( """ head = Seq2SeqLMHead(self, head_name) self.add_prediction_head(head, overwrite_ok=overwrite_ok) + + +class BartModelWithHeads(BartAdapterModel): + def __init__(self, *args, **kwargs): + warnings.warn( + "This class has been renamed to `{}` in v3. " + "Please use the new class instead as this class might be removed in a future version.".format( + self.__class__.__bases__[0].__name__ + ), + FutureWarning, + ) + super().__init__(*args, **kwargs) + + @classmethod + def from_config(cls, config): + warnings.warn( + "This class has been renamed to `{}` in v3. " + "Please use the new class instead as this class might be removed in a future version.".format( + cls.__bases__[0].__name__ + ), + FutureWarning, + ) + return super().from_config(config) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + warnings.warn( + "This class has been renamed to `{}` in v3. 
" + "Please use the new class instead as this class might be removed in a future version.".format( + cls.__bases__[0].__name__ + ), + FutureWarning, + ) + return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) diff --git a/src/transformers/adapters/models/bert.py b/src/transformers/adapters/models/bert.py index 6fd51986f4..31fbaad952 100644 --- a/src/transformers/adapters/models/bert.py +++ b/src/transformers/adapters/models/bert.py @@ -1,8 +1,21 @@ -import logging -from typing import Iterable, Tuple - -import torch.nn as nn +import warnings +from ...file_utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, +) +from ...models.bert.modeling_bert import ( + _CHECKPOINT_FOR_DOC, + _CONFIG_FOR_DOC, + _TOKENIZER_FOR_DOC, + BERT_INPUTS_DOCSTRING, + BERT_START_DOCSTRING, + BertModel, + BertPreTrainedModel, +) +from ..context import AdapterSetup from ..heads import ( BertStyleMaskedLMHead, BiaffineParsingHead, @@ -14,41 +27,86 @@ QuestionAnsweringHead, TaggingHead, ) -from ..layer import AdapterLayer -from ..model_mixin import InvertibleAdaptersMixin, ModelAdaptersMixin - - -logger = logging.getLogger(__name__) - -# For backwards compatibility, BertSelfOutput inherits directly from AdapterLayer -class BertSelfOutputAdaptersMixin(AdapterLayer): - """Adds adapters to the BertSelfOutput module.""" - - def __init__(self): - super().__init__("mh_adapter", None) +@add_start_docstrings( + """Bert Model transformer with the option to add multiple flexible heads on top.""", + BERT_START_DOCSTRING, +) +class BertAdapterModel(ModelWithFlexibleHeadsAdaptersMixin, BertPreTrainedModel): + def __init__(self, config): + super().__init__(config) -# For backwards compatibility, BertOutput inherits directly from AdapterLayer -class BertOutputAdaptersMixin(AdapterLayer): - """Adds adapters to the BertOutput module.""" + self.bert = BertModel(config) - def __init__(self): - super().__init__("output_adapter", None) + self._init_head_modules() + self.init_weights() -class BertModelAdaptersMixin(InvertibleAdaptersMixin, ModelAdaptersMixin): - """Adds adapters to the BertModel module.""" + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=ModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + head=None, + **kwargs + ): + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) - def iter_layers(self) -> Iterable[Tuple[int, nn.Module]]: - for i, layer in enumerate(self.encoder.layer): - yield i, layer + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + 
token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # BERT & RoBERTa return the pooled output as second item, we don't need that in these heads + if not return_dict: + head_inputs = (outputs[0],) + outputs[2:] + else: + head_inputs = outputs + pooled_output = outputs[1] -class BertModelHeadsMixin(ModelWithFlexibleHeadsAdaptersMixin): - """ - Adds flexible heads to a BERT-based model class. - """ + if head or AdapterSetup.get_context_head_setup() or self.active_head: + head_outputs = self.forward_head( + head_inputs, + head_name=head, + attention_mask=attention_mask, + return_dict=return_dict, + pooled_output=pooled_output, + **kwargs, + ) + return head_outputs + else: + # in case no head is used just return the output of the base model (including pooler output) + return outputs head_types = { "classification": ClassificationHead, @@ -177,3 +235,37 @@ def add_causal_lm_head(self, head_name, activation_function="gelu", overwrite_ok self, head_name, layers=2, activation_function=activation_function, layer_norm=True, bias=True ) self.add_prediction_head(head, overwrite_ok=overwrite_ok) + + +class BertModelWithHeads(BertAdapterModel): + def __init__(self, *args, **kwargs): + warnings.warn( + "This class has been renamed to `{}` in v3. " + "Please use the new class instead as this class might be removed in a future version.".format( + self.__class__.__bases__[0].__name__ + ), + FutureWarning, + ) + super().__init__(*args, **kwargs) + + @classmethod + def from_config(cls, config): + warnings.warn( + "This class has been renamed to `{}` in v3. " + "Please use the new class instead as this class might be removed in a future version.".format( + cls.__bases__[0].__name__ + ), + FutureWarning, + ) + return super().from_config(config) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + warnings.warn( + "This class has been renamed to `{}` in v3. 
" + "Please use the new class instead as this class might be removed in a future version.".format( + cls.__bases__[0].__name__ + ), + FutureWarning, + ) + return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) diff --git a/src/transformers/adapters/models/distilbert.py b/src/transformers/adapters/models/distilbert.py index 7b81664c9a..c48458ac86 100644 --- a/src/transformers/adapters/models/distilbert.py +++ b/src/transformers/adapters/models/distilbert.py @@ -1,31 +1,271 @@ -from typing import Iterable, Tuple +import warnings import torch.nn as nn -from ..layer import AdapterLayer -from ..model_mixin import InvertibleAdaptersMixin, ModelAdaptersMixin -from .bert import BertModelHeadsMixin +from ...file_utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, +) +from ...models.distilbert.modeling_distilbert import ( + _CONFIG_FOR_DOC, + _TOKENIZER_FOR_DOC, + DISTILBERT_INPUTS_DOCSTRING, + DISTILBERT_START_DOCSTRING, + DistilBertModel, + DistilBertPreTrainedModel, +) +from ..heads import ( + BertStyleMaskedLMHead, + BiaffineParsingHead, + CausalLMHead, + ClassificationHead, + ModelWithFlexibleHeadsAdaptersMixin, + MultiLabelClassificationHead, + MultipleChoiceHead, + QuestionAnsweringHead, + TaggingHead, +) -class DistilBertTransfomerBlockAdaptersMixin: - """Adds adapters to the TransformerBlock module of DistilBert.""" +@add_start_docstrings( + """DistilBert Model transformer with the option to add multiple flexible heads on top.""", + DISTILBERT_START_DOCSTRING, +) +class DistilBertAdapterModel(ModelWithFlexibleHeadsAdaptersMixin, DistilBertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.distilbert = DistilBertModel(config) - def _init_adapter_modules(self): - self.attention_adapters = AdapterLayer("mh_adapter", self.config) - self.output_adapters = AdapterLayer("output_adapter", self.config) - self.attention_adapters._init_adapter_modules() - self.output_adapters._init_adapter_modules() + self._init_head_modules() + self.init_weights() -class DistilBertModelAdaptersMixin(InvertibleAdaptersMixin, ModelAdaptersMixin): - """Adds adapters to the DistilBert module.""" + def get_position_embeddings(self) -> nn.Embedding: + """ + Returns the position embeddings + """ + return self.distilbert.get_position_embeddings() - def iter_layers(self) -> Iterable[Tuple[int, nn.Module]]: - for i, layer in enumerate(self.transformer.layer): - yield i, layer + def resize_position_embeddings(self, new_num_position_embeddings: int): + """ + Resizes position embeddings of the model if :obj:`new_num_position_embeddings != + config.max_position_embeddings`. + Arguments: + new_num_position_embeddings (:obj:`int`): + The number of new position embedding matrix. If position embeddings are learned, increasing the size + will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the + end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the + size will add correct vectors at the end following the position encoding algorithm, whereas reducing + the size will remove vectors from the end. 
+ """ + self.distilbert.resize_position_embeddings(new_num_position_embeddings) -class DistilBertModelHeadsMixin(BertModelHeadsMixin): - """Adds heads to a DistilBert model.""" + @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint="distilbert-base-uncased", + output_type=ModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + head=None, + **kwargs + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict - pass + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + distilbert_output = self.distilbert( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + outputs = self.forward_head( + distilbert_output, head_name=head, attention_mask=attention_mask, return_dict=return_dict, **kwargs + ) + + return outputs + + head_types = { + "classification": ClassificationHead, + "multilabel_classification": MultiLabelClassificationHead, + "tagging": TaggingHead, + "multiple_choice": MultipleChoiceHead, + "question_answering": QuestionAnsweringHead, + "dependency_parsing": BiaffineParsingHead, + "masked_lm": BertStyleMaskedLMHead, + "causal_lm": CausalLMHead, + } + + def add_classification_head( + self, + head_name, + num_labels=2, + layers=2, + activation_function="tanh", + overwrite_ok=False, + multilabel=False, + id2label=None, + use_pooler=False, + ): + """ + Adds a sequence classification head on top of the model. + + Args: + head_name (str): The name of the head. + num_labels (int, optional): Number of classification labels. Defaults to 2. + layers (int, optional): Number of layers. Defaults to 2. + activation_function (str, optional): Activation function. Defaults to 'tanh'. + overwrite_ok (bool, optional): Force overwrite if a head with the same name exists. Defaults to False. + multilabel (bool, optional): Enable multilabel classification setup. Defaults to False. + """ + + if multilabel: + head = MultiLabelClassificationHead( + self, head_name, num_labels, layers, activation_function, id2label, use_pooler + ) + else: + head = ClassificationHead(self, head_name, num_labels, layers, activation_function, id2label, use_pooler) + self.add_prediction_head(head, overwrite_ok) + + def add_multiple_choice_head( + self, + head_name, + num_choices=2, + layers=2, + activation_function="tanh", + overwrite_ok=False, + id2label=None, + use_pooler=False, + ): + """ + Adds a multiple choice head on top of the model. + + Args: + head_name (str): The name of the head. + num_choices (int, optional): Number of choices. Defaults to 2. + layers (int, optional): Number of layers. Defaults to 2. + activation_function (str, optional): Activation function. Defaults to 'tanh'. + overwrite_ok (bool, optional): Force overwrite if a head with the same name exists. Defaults to False. 
+ """ + head = MultipleChoiceHead(self, head_name, num_choices, layers, activation_function, id2label, use_pooler) + self.add_prediction_head(head, overwrite_ok) + + def add_tagging_head( + self, head_name, num_labels=2, layers=1, activation_function="tanh", overwrite_ok=False, id2label=None + ): + """ + Adds a token classification head on top of the model. + + Args: + head_name (str): The name of the head. + num_labels (int, optional): Number of classification labels. Defaults to 2. + layers (int, optional): Number of layers. Defaults to 1. + activation_function (str, optional): Activation function. Defaults to 'tanh'. + overwrite_ok (bool, optional): Force overwrite if a head with the same name exists. Defaults to False. + """ + head = TaggingHead(self, head_name, num_labels, layers, activation_function, id2label) + self.add_prediction_head(head, overwrite_ok) + + def add_qa_head( + self, head_name, num_labels=2, layers=1, activation_function="tanh", overwrite_ok=False, id2label=None + ): + head = QuestionAnsweringHead(self, head_name, num_labels, layers, activation_function, id2label) + self.add_prediction_head(head, overwrite_ok) + + def add_dependency_parsing_head(self, head_name, num_labels=2, overwrite_ok=False, id2label=None): + """ + Adds a biaffine dependency parsing head on top of the model. The parsing head uses the architecture described + in "Is Supervised Syntactic Parsing Beneficial for Language Understanding? An Empirical Investigation" (Glavaš + & Vulić, 2021) (https://arxiv.org/pdf/2008.06788.pdf). + + Args: + head_name (str): The name of the head. + num_labels (int, optional): Number of labels. Defaults to 2. + overwrite_ok (bool, optional): Force overwrite if a head with the same name exists. Defaults to False. + id2label (dict, optional): Mapping from label ids to labels. Defaults to None. + """ + head = BiaffineParsingHead(self, head_name, num_labels, id2label) + self.add_prediction_head(head, overwrite_ok) + + def add_masked_lm_head(self, head_name, activation_function="gelu", overwrite_ok=False): + """ + Adds a masked language modeling head on top of the model. + + Args: + head_name (str): The name of the head. + activation_function (str, optional): Activation function. Defaults to 'gelu'. + overwrite_ok (bool, optional): Force overwrite if a head with the same name exists. Defaults to False. + """ + head = BertStyleMaskedLMHead(self, head_name, activation_function=activation_function) + self.add_prediction_head(head, overwrite_ok=overwrite_ok) + + def add_causal_lm_head(self, head_name, activation_function="gelu", overwrite_ok=False): + """ + Adds a causal language modeling head on top of the model. + + Args: + head_name (str): The name of the head. + activation_function (str, optional): Activation function. Defaults to 'gelu'. + overwrite_ok (bool, optional): Force overwrite if a head with the same name exists. Defaults to False. + """ + head = CausalLMHead( + self, head_name, layers=2, activation_function=activation_function, layer_norm=True, bias=True + ) + self.add_prediction_head(head, overwrite_ok=overwrite_ok) + + +class DistilBertModelWithHeads(DistilBertAdapterModel): + def __init__(self, *args, **kwargs): + warnings.warn( + "This class has been renamed to `{}` in v3. 
" + "Please use the new class instead as this class might be removed in a future version.".format( + self.__class__.__bases__[0].__name__ + ), + FutureWarning, + ) + super().__init__(*args, **kwargs) + + @classmethod + def from_config(cls, config): + warnings.warn( + "This class has been renamed to `{}` in v3. " + "Please use the new class instead as this class might be removed in a future version.".format( + cls.__bases__[0].__name__ + ), + FutureWarning, + ) + return super().from_config(config) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + warnings.warn( + "This class has been renamed to `{}` in v3. " + "Please use the new class instead as this class might be removed in a future version.".format( + cls.__bases__[0].__name__ + ), + FutureWarning, + ) + return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) diff --git a/src/transformers/adapters/models/gpt2.py b/src/transformers/adapters/models/gpt2.py index b43ebd2577..0f30f2014d 100644 --- a/src/transformers/adapters/models/gpt2.py +++ b/src/transformers/adapters/models/gpt2.py @@ -1,31 +1,103 @@ -from typing import Iterable, Tuple +import logging +import warnings -import torch.nn as nn +import torch -from ..heads import CausalLMHead, ClassificationHead, MultiLabelClassificationHead, TaggingHead -from ..layer import AdapterLayer -from ..model_mixin import InvertibleAdaptersMixin, ModelAdaptersMixin -from .bert import ModelWithFlexibleHeadsAdaptersMixin +from ...file_utils import add_start_docstrings +from ...models.gpt2.modeling_gpt2 import GPT2_START_DOCSTRING, GPT2Model, GPT2PreTrainedModel +from ..composition import adjust_tensors_for_parallel +from ..heads import ( + CausalLMHead, + ClassificationHead, + ModelWithFlexibleHeadsAdaptersMixin, + MultiLabelClassificationHead, + TaggingHead, +) -class GPT2DecoderBlockAdaptersMixin: - """Adds adapters to the TransformerBlock module of DistilBert.""" +logger = logging.getLogger(__name__) - def _init_adapter_modules(self): - self.attention_adapters = AdapterLayer("mh_adapter", self.config) - self.output_adapters = AdapterLayer("output_adapter", self.config) - self.attention_adapters._init_adapter_modules() - self.output_adapters._init_adapter_modules() +@add_start_docstrings( + """ +The GPT2 Model that allows the loading of different heads dor different tasks. This enables a flexible use of the +models and adpters. Since this class does classification on the last token, it requires to know the position of the +last token. If a :obj:`pad_token_id` is defined in the configuration, it finds the last token that is not a padding +token in each row. If no :obj:`pad_token_id` is defined, it simply takes the last value in each row of the batch. Since +it cannot guess the padding tokens when :obj:`inputs_embeds` are passed instead of :obj:`input_ids`, it does the same +(take the last value in each row of the batch). 
+""", + GPT2_START_DOCSTRING, +) +class GPT2AdapterModel(ModelWithFlexibleHeadsAdaptersMixin, GPT2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.transformer = GPT2Model(config) -class GPT2ModelAdapterMixin(InvertibleAdaptersMixin, ModelAdaptersMixin): - def iter_layers(self) -> Iterable[Tuple[int, nn.Module]]: - for i, layer in enumerate(self.base_model.h): - yield i, layer + self._init_head_modules() + self.init_weights() -class GPT2ModelHeadsMixin(ModelWithFlexibleHeadsAdaptersMixin): - """Adds flexible heads to a GPT-2 model.""" + # Model parallel + self.model_parallel = False + self.device_map = None + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + head=None, + **kwargs + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + batch_size = outputs[0].shape[0] + + if self.config.pad_token_id is None: + # TODO-AH: this may result in unexpected behavior for classification. Find a better way to do this? + sequence_lengths = -1 + else: + if input_ids is not None: + sequence_lengths = torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1 + (sequence_lengths,) = adjust_tensors_for_parallel(outputs[0], sequence_lengths) + else: + sequence_lengths = -1 + logger.warning( + f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be " + f"unexpected if using padding tokens in conjunction with `inputs_embeds.`" + ) + + cls_logits = outputs[0][range(batch_size), sequence_lengths] + + outputs = self.forward_head( + outputs, + head_name=head, + cls_output=cls_logits, + attention_mask=attention_mask, + return_dict=return_dict, + **kwargs, + ) + + return outputs head_types = { "classification": ClassificationHead, @@ -72,3 +144,37 @@ def add_causal_lm_head(self, head_name, overwrite_ok=False): """ head = CausalLMHead(self, head_name) self.add_prediction_head(head, overwrite_ok=overwrite_ok) + + +class GPT2ModelWithHeads(GPT2AdapterModel): + def __init__(self, *args, **kwargs): + warnings.warn( + "This class has been renamed to `{}` in v3. " + "Please use the new class instead as this class might be removed in a future version.".format( + self.__class__.__bases__[0].__name__ + ), + FutureWarning, + ) + super().__init__(*args, **kwargs) + + @classmethod + def from_config(cls, config): + warnings.warn( + "This class has been renamed to `{}` in v3. " + "Please use the new class instead as this class might be removed in a future version.".format( + cls.__bases__[0].__name__ + ), + FutureWarning, + ) + return super().from_config(config) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + warnings.warn( + "This class has been renamed to `{}` in v3. 
" + "Please use the new class instead as this class might be removed in a future version.".format( + cls.__bases__[0].__name__ + ), + FutureWarning, + ) + return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) diff --git a/src/transformers/adapters/models/mbart.py b/src/transformers/adapters/models/mbart.py new file mode 100644 index 0000000000..5106b333e8 --- /dev/null +++ b/src/transformers/adapters/models/mbart.py @@ -0,0 +1,260 @@ +import warnings + +import torch + +from ...file_utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, +) +from ...models.mbart.modeling_mbart import ( + _CHECKPOINT_FOR_DOC, + _CONFIG_FOR_DOC, + _TOKENIZER_FOR_DOC, + MBART_INPUTS_DOCSTRING, + MBART_START_DOCSTRING, + MBartConfig, + MBartModel, + MBartPreTrainedModel, + shift_tokens_right, +) +from ..composition import adjust_tensors_for_parallel +from ..heads import ( + ClassificationHead, + ModelWithFlexibleHeadsAdaptersMixin, + MultiLabelClassificationHead, + QuestionAnsweringHead, + Seq2SeqLMHead, +) + + +@add_start_docstrings( + "MBART Model with the option to add multiple flexible prediction heads on top.", MBART_START_DOCSTRING +) +class MBartAdapterModel(ModelWithFlexibleHeadsAdaptersMixin, MBartPreTrainedModel): + def __init__(self, config: MBartConfig, **kwargs): + super().__init__(config, **kwargs) + self.model = MBartModel(config) + + self._init_head_modules() + + def get_encoder(self): + return self.model.get_encoder() + + def get_decoder(self): + return self.model.get_decoder() + + @add_start_docstrings_to_model_forward(MBART_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=ModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + encoder_outputs=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + head=None, + **kwargs + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if "labels" in kwargs or "start_positions" in kwargs and "end_positions" in kwargs: + use_cache = False + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # sequence classification based on last token in sequence + x = outputs[0] # last hidden state + if input_ids is not None and x.shape[1] == input_ids.shape[1]: + eos_mask = input_ids.eq(self.config.eos_token_id) + (eos_mask,) = adjust_tensors_for_parallel(x, eos_mask) + if len(torch.unique(eos_mask.sum(1))) > 1: + raise ValueError("All examples must have the same number of tokens.") + cls_representation = x[eos_mask, :].view(x.size(0), -1, x.size(-1))[:, -1, :] + else: + cls_representation = x + + head_outputs = self.forward_head( + outputs, + head_name=head, + cls_output=cls_representation, + attention_mask=attention_mask, + return_dict=return_dict, + **kwargs, + ) + + return head_outputs + + # Copied from MBartForConditionalGeneration + def prepare_inputs_for_generation( + self, + decoder_input_ids, + past=None, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + use_cache=None, + encoder_outputs=None, + **kwargs + ): + # cut decoder_input_ids if past is used + if past is not None: + decoder_input_ids = decoder_input_ids[:, -1:] + + return { + "input_ids": None, # encoder_outputs is defined. input_ids not needed + "encoder_outputs": encoder_outputs, + "past_key_values": past, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "cross_attn_head_mask": cross_attn_head_mask, + "use_cache": use_cache, # change this to avoid caching (presumably for debugging) + } + + # Copied from MBartForConditionalGeneration + def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): + return shift_tokens_right(labels, self.config.pad_token_id) + + # Copied from MBartForConditionalGeneration + @staticmethod + def _reorder_cache(past, beam_idx): + reordered_past = () + for layer_past in past: + # cached cross_attention states don't have to be reordered -> they are always the same + reordered_past += ( + tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:], + ) + return reordered_past + + head_types = { + "classification": ClassificationHead, + "multilabel_classification": MultiLabelClassificationHead, + "question_answering": QuestionAnsweringHead, + "seq2seq_lm": Seq2SeqLMHead, + } + + def add_classification_head( + self, + head_name, + num_labels=2, + layers=2, + activation_function="tanh", + overwrite_ok=False, + multilabel=False, + id2label=None, + ): + """ + Adds a sequence classification head on top of the model. + + Args: + head_name (str): The name of the head. + num_labels (int, optional): Number of classification labels. Defaults to 2. + layers (int, optional): Number of layers. Defaults to 2. + activation_function (str, optional): Activation function. Defaults to 'tanh'. 
+ overwrite_ok (bool, optional): Force overwrite if a head with the same name exists. Defaults to False. + multilabel (bool, optional): Enable multilabel classification setup. Defaults to False. + """ + + if multilabel: + head = MultiLabelClassificationHead(self, head_name, num_labels, layers, activation_function, id2label) + else: + head = ClassificationHead(self, head_name, num_labels, layers, activation_function, id2label) + self.add_prediction_head(head, overwrite_ok) + + def add_qa_head( + self, + head_name, + num_labels=2, + layers=1, + activation_function="tanh", + overwrite_ok=False, + id2label=None, + ): + head = QuestionAnsweringHead(self, head_name, num_labels, layers, activation_function, id2label) + self.add_prediction_head(head, overwrite_ok) + + def add_seq2seq_lm_head( + self, + head_name, + overwrite_ok=False, + ): + """ + Adds a sequence-to-sequence language modeling head on top of the model. + + Args: + head_name (str): The name of the head. + overwrite_ok (bool, optional): Force overwrite if a head with the same name exists. Defaults to False. + """ + head = Seq2SeqLMHead(self, head_name) + self.add_prediction_head(head, overwrite_ok=overwrite_ok) + + +class MBartModelWithHeads(MBartAdapterModel): + def __init__(self, *args, **kwargs): + warnings.warn( + "This class has been renamed to `{}` in v3. " + "Please use the new class instead as this class might be removed in a future version.".format( + self.__class__.__bases__[0].__name__ + ), + FutureWarning, + ) + super().__init__(*args, **kwargs) + + @classmethod + def from_config(cls, config): + warnings.warn( + "This class has been renamed to `{}` in v3. " + "Please use the new class instead as this class might be removed in a future version.".format( + cls.__bases__[0].__name__ + ), + FutureWarning, + ) + return super().from_config(config) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + warnings.warn( + "This class has been renamed to `{}` in v3. 
" + "Please use the new class instead as this class might be removed in a future version.".format( + cls.__bases__[0].__name__ + ), + FutureWarning, + ) + return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) diff --git a/src/transformers/adapters/models/roberta.py b/src/transformers/adapters/models/roberta.py new file mode 100644 index 0000000000..514a9d9a4e --- /dev/null +++ b/src/transformers/adapters/models/roberta.py @@ -0,0 +1,259 @@ +import warnings + +from ...file_utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, +) +from ...models.roberta.modeling_roberta import ( + _CONFIG_FOR_DOC, + _TOKENIZER_FOR_DOC, + ROBERTA_INPUTS_DOCSTRING, + ROBERTA_START_DOCSTRING, + RobertaModel, + RobertaPreTrainedModel, +) +from ..context import AdapterSetup +from ..heads import ( + BertStyleMaskedLMHead, + BiaffineParsingHead, + CausalLMHead, + ClassificationHead, + ModelWithFlexibleHeadsAdaptersMixin, + MultiLabelClassificationHead, + MultipleChoiceHead, + QuestionAnsweringHead, + TaggingHead, +) + + +@add_start_docstrings( + """Roberta Model transformer with the option to add multiple flexible heads on top.""", + ROBERTA_START_DOCSTRING, +) +class RobertaAdapterModel(ModelWithFlexibleHeadsAdaptersMixin, RobertaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.roberta = RobertaModel(config) + + self._init_head_modules() + + self.init_weights() + + @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint="roberta-base", + output_type=ModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + head=None, + **kwargs + ): + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # BERT & RoBERTa return the pooled output as second item, we don't need that in these heads + if not return_dict: + head_inputs = (outputs[0],) + outputs[2:] + else: + head_inputs = outputs + pooled_output = outputs[1] + + if head or AdapterSetup.get_context_head_setup() or self.active_head: + head_outputs = self.forward_head( + head_inputs, + head_name=head, + attention_mask=attention_mask, + return_dict=return_dict, + pooled_output=pooled_output, + **kwargs, + ) + return head_outputs + else: + # in case no head is used just return the output of the base model (including pooler output) + 
return outputs + + def add_classification_head( + self, + head_name, + num_labels=2, + layers=2, + activation_function="tanh", + overwrite_ok=False, + multilabel=False, + id2label=None, + use_pooler=False, + ): + """ + Adds a sequence classification head on top of the model. + + Args: + head_name (str): The name of the head. + num_labels (int, optional): Number of classification labels. Defaults to 2. + layers (int, optional): Number of layers. Defaults to 2. + activation_function (str, optional): Activation function. Defaults to 'tanh'. + overwrite_ok (bool, optional): Force overwrite if a head with the same name exists. Defaults to False. + multilabel (bool, optional): Enable multilabel classification setup. Defaults to False. + """ + + if multilabel: + head = MultiLabelClassificationHead( + self, head_name, num_labels, layers, activation_function, id2label, use_pooler + ) + else: + head = ClassificationHead(self, head_name, num_labels, layers, activation_function, id2label, use_pooler) + self.add_prediction_head(head, overwrite_ok) + + def add_multiple_choice_head( + self, + head_name, + num_choices=2, + layers=2, + activation_function="tanh", + overwrite_ok=False, + id2label=None, + use_pooler=False, + ): + """ + Adds a multiple choice head on top of the model. + + Args: + head_name (str): The name of the head. + num_choices (int, optional): Number of choices. Defaults to 2. + layers (int, optional): Number of layers. Defaults to 2. + activation_function (str, optional): Activation function. Defaults to 'tanh'. + overwrite_ok (bool, optional): Force overwrite if a head with the same name exists. Defaults to False. + """ + head = MultipleChoiceHead(self, head_name, num_choices, layers, activation_function, id2label, use_pooler) + self.add_prediction_head(head, overwrite_ok) + + def add_tagging_head( + self, head_name, num_labels=2, layers=1, activation_function="tanh", overwrite_ok=False, id2label=None + ): + """ + Adds a token classification head on top of the model. + + Args: + head_name (str): The name of the head. + num_labels (int, optional): Number of classification labels. Defaults to 2. + layers (int, optional): Number of layers. Defaults to 1. + activation_function (str, optional): Activation function. Defaults to 'tanh'. + overwrite_ok (bool, optional): Force overwrite if a head with the same name exists. Defaults to False. + """ + head = TaggingHead(self, head_name, num_labels, layers, activation_function, id2label) + self.add_prediction_head(head, overwrite_ok) + + def add_qa_head( + self, head_name, num_labels=2, layers=1, activation_function="tanh", overwrite_ok=False, id2label=None + ): + head = QuestionAnsweringHead(self, head_name, num_labels, layers, activation_function, id2label) + self.add_prediction_head(head, overwrite_ok) + + def add_dependency_parsing_head(self, head_name, num_labels=2, overwrite_ok=False, id2label=None): + """ + Adds a biaffine dependency parsing head on top of the model. The parsing head uses the architecture described + in "Is Supervised Syntactic Parsing Beneficial for Language Understanding? An Empirical Investigation" (Glavaš + & Vulić, 2021) (https://arxiv.org/pdf/2008.06788.pdf). + + Args: + head_name (str): The name of the head. + num_labels (int, optional): Number of labels. Defaults to 2. + overwrite_ok (bool, optional): Force overwrite if a head with the same name exists. Defaults to False. + id2label (dict, optional): Mapping from label ids to labels. Defaults to None. 
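        A minimal usage sketch (the head name is an arbitrary placeholder)::

            model = RobertaAdapterModel.from_pretrained("roberta-base")
            model.add_dependency_parsing_head("parsing")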
+ """ + head = BiaffineParsingHead(self, head_name, num_labels, id2label) + self.add_prediction_head(head, overwrite_ok) + + def add_masked_lm_head(self, head_name, activation_function="gelu", overwrite_ok=False): + """ + Adds a masked language modeling head on top of the model. + + Args: + head_name (str): The name of the head. + activation_function (str, optional): Activation function. Defaults to 'gelu'. + overwrite_ok (bool, optional): Force overwrite if a head with the same name exists. Defaults to False. + """ + head = BertStyleMaskedLMHead(self, head_name, activation_function=activation_function) + self.add_prediction_head(head, overwrite_ok=overwrite_ok) + + def add_causal_lm_head(self, head_name, activation_function="gelu", overwrite_ok=False): + """ + Adds a causal language modeling head on top of the model. + + Args: + head_name (str): The name of the head. + activation_function (str, optional): Activation function. Defaults to 'gelu'. + overwrite_ok (bool, optional): Force overwrite if a head with the same name exists. Defaults to False. + """ + head = CausalLMHead( + self, head_name, layers=2, activation_function=activation_function, layer_norm=True, bias=True + ) + self.add_prediction_head(head, overwrite_ok=overwrite_ok) + + +class RobertaModelWithHeads(RobertaAdapterModel): + def __init__(self, *args, **kwargs): + warnings.warn( + "This class has been renamed to `{}` in v3. " + "Please use the new class instead as this class might be removed in a future version.".format( + self.__class__.__bases__[0].__name__ + ), + FutureWarning, + ) + super().__init__(*args, **kwargs) + + @classmethod + def from_config(cls, config): + warnings.warn( + "This class has been renamed to `{}` in v3. " + "Please use the new class instead as this class might be removed in a future version.".format( + cls.__bases__[0].__name__ + ), + FutureWarning, + ) + return super().from_config(config) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + warnings.warn( + "This class has been renamed to `{}` in v3. 
" + "Please use the new class instead as this class might be removed in a future version.".format( + cls.__bases__[0].__name__ + ), + FutureWarning, + ) + return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) diff --git a/src/transformers/adapters/models/t5.py b/src/transformers/adapters/models/t5.py index fa344bc5b4..c1fa3021b1 100644 --- a/src/transformers/adapters/models/t5.py +++ b/src/transformers/adapters/models/t5.py @@ -1,56 +1,162 @@ -from typing import Iterable, Tuple - -import torch.nn as nn - -from ..heads import Seq2SeqLMHead -from ..layer import AdapterLayer -from ..model_mixin import InvertibleAdaptersMixin, ModelAdaptersMixin -from .bert import ModelWithFlexibleHeadsAdaptersMixin - - -class T5SelfAttentionLayerAdaptersMixin(AdapterLayer): - def __init__(self): - super().__init__("mh_adapter", None) - - -class T5CrossAttentionLayerAdaptersMixin(AdapterLayer): - def __init__(self): - super().__init__("cross_adapter", None) - - -class T5FFLayerAdaptersMixin(AdapterLayer): - def __init__(self): - super().__init__("output_adapter", None) - - -class T5ModelAdaptersMixin(InvertibleAdaptersMixin, ModelAdaptersMixin): - """Adds adapters to the T5Model class.""" - - def iter_layers(self) -> Iterable[Tuple[int, nn.Module]]: - if hasattr(self, "encoder"): - for i, layer in enumerate(self.encoder.block): - yield i, layer - for i, layer in enumerate(self.decoder.block, start=len(self.encoder.block)): - yield i, layer +import logging +import warnings + +import torch + +from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward +from ...models.t5.modeling_t5 import T5_INPUTS_DOCSTRING, T5_START_DOCSTRING, T5Model, T5PreTrainedModel +from ..heads import ModelWithFlexibleHeadsAdaptersMixin, Seq2SeqLMHead + + +logger = logging.getLogger(__name__) + + +@add_start_docstrings("T5 Model with the option to add multiple flexible prediction heads on top.", T5_START_DOCSTRING) +class T5AdapterModel(ModelWithFlexibleHeadsAdaptersMixin, T5PreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.transformer = T5Model(config) + + self._init_head_modules() + self._init_adapter_modules() + self.init_weights() + + # Model parallel + self.model_parallel = False + self.device_map = None + + def get_encoder(self): + return self.transformer.encoder + + def get_decoder(self): + return self.transformer.decoder + + @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + encoder_outputs=None, + past_key_values=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + head=None, + **kwargs + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: + # get decoder inputs from shifting lm labels to the right + decoder_input_ids = self._shift_right(labels) + + model_output = self.transformer( + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + encoder_outputs=encoder_outputs, + past_key_values=past_key_values, + 
inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = model_output[0] + # ToDo move head to device for parallel forward pass + + if self.config.tie_word_embeddings: + # Rescale output before projecting on vocab + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 + new_hidden_state = sequence_output * (self.config.d_model ** -0.5) + if isinstance(model_output, tuple): + model_output = (new_hidden_state,) + model_output[1:] + else: + model_output["last_hidden_state"] = new_hidden_state + + if head or self.active_head: + kwargs["labels"] = labels + head_outputs = self.forward_head( + model_output, + head_name=head, + return_dict=return_dict, + **kwargs, + ) + return head_outputs else: - for i, layer in enumerate(self.decoder.block): - yield i, layer - - def _init_adapter_modules(self): - if hasattr(self, "encoder"): - # In T5, the invertible adapters are implemented by the encoder module. - # Therefore, relay mixin calls to the encoder here. - self.invertible_adapters = self.encoder.invertible_adapters - self.add_invertible_adapter = self.encoder.add_invertible_adapter - self.get_invertible_adapter = self.encoder.get_invertible_adapter - self.enable_invertible_adapters = self.encoder.enable_invertible_adapters - self.invertible_adapters_forward = self.encoder.invertible_adapters_forward - self.delete_invertible_adapter = self.encoder.delete_invertible_adapter - super()._init_adapter_modules() - - -class T5ModelHeadsMixin(ModelWithFlexibleHeadsAdaptersMixin): - """Adds flexible heads to a T5 model.""" + return model_output + + # Copied from T5ForConditionalGeneration + def prepare_inputs_for_generation( + self, + input_ids, + past=None, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + use_cache=None, + encoder_outputs=None, + **kwargs + ): + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return { + "decoder_input_ids": input_ids, + "past_key_values": past, + "encoder_outputs": encoder_outputs, + "attention_mask": attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "cross_attn_head_mask": cross_attn_head_mask, + "use_cache": use_cache, + } + + # Copied from T5ForConditionalGeneration + def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): + return self._shift_right(labels) + + # Copied from T5ForConditionalGeneration + def _reorder_cache(self, past, beam_idx): + # if decoder past is not included in output + # speedy decoding is disabled and no need to reorder + if past is None: + logger.warning("You might want to consider setting `use_cache=True` to speed up decoding") + return past + + reordered_decoder_past = () + for layer_past_states in past: + # get the correct batch idx from layer past batch dim + # batch dim of `past` is at 2nd position + reordered_layer_past_states = () + for layer_past_state in layer_past_states: + # need to set correct `past` for each of the four key / value states + reordered_layer_past_states = reordered_layer_past_states + ( + layer_past_state.index_select(0, beam_idx.to(layer_past_state.device)), + ) + + assert reordered_layer_past_states[0].shape == layer_past_states[0].shape + assert len(reordered_layer_past_states) == len(layer_past_states) + + 
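            # collect this layer's reordered key/value states into the rebuilt past tuple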
reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,) + return reordered_decoder_past head_types = { "seq2seq_lm": Seq2SeqLMHead, @@ -66,3 +172,37 @@ def add_seq2seq_lm_head(self, head_name, overwrite_ok=False): """ head = Seq2SeqLMHead(self, head_name) self.add_prediction_head(head, overwrite_ok=overwrite_ok) + + +class T5ModelWithHeads(T5AdapterModel): + def __init__(self, *args, **kwargs): + warnings.warn( + "This class has been renamed to `{}` in v3. " + "Please use the new class instead as this class might be removed in a future version.".format( + self.__class__.__bases__[0].__name__ + ), + FutureWarning, + ) + super().__init__(*args, **kwargs) + + @classmethod + def from_config(cls, config): + warnings.warn( + "This class has been renamed to `{}` in v3. " + "Please use the new class instead as this class might be removed in a future version.".format( + cls.__bases__[0].__name__ + ), + FutureWarning, + ) + return super().from_config(config) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + warnings.warn( + "This class has been renamed to `{}` in v3. " + "Please use the new class instead as this class might be removed in a future version.".format( + cls.__bases__[0].__name__ + ), + FutureWarning, + ) + return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) diff --git a/src/transformers/adapters/models/xlm_roberta.py b/src/transformers/adapters/models/xlm_roberta.py new file mode 100644 index 0000000000..62424f4635 --- /dev/null +++ b/src/transformers/adapters/models/xlm_roberta.py @@ -0,0 +1,29 @@ +from ...file_utils import add_start_docstrings +from ...models.xlm_roberta.modeling_xlm_roberta import XLM_ROBERTA_START_DOCSTRING, XLMRobertaConfig +from .roberta import RobertaAdapterModel, RobertaModelWithHeads + + +@add_start_docstrings( + """XLM-RoBERTa Model with the option to add multiple flexible heads on top.""", + XLM_ROBERTA_START_DOCSTRING, +) +class XLMRobertaAdapterModel(RobertaAdapterModel): + """ + This class overrides :class:`~transformers.RobertaAdapterModel`. Please check the superclass for the appropriate + documentation alongside usage examples. + """ + + config_class = XLMRobertaConfig + + +@add_start_docstrings( + """XLM-RoBERTa Model with the option to add multiple flexible heads on top.""", + XLM_ROBERTA_START_DOCSTRING, +) +class XLMRobertaModelWithHeads(RobertaModelWithHeads): + """ + This class overrides :class:`~transformers.RobertaModelWithHeads`. Please check the superclass for the appropriate + documentation alongside usage examples. 
+ """ + + config_class = XLMRobertaConfig diff --git a/src/transformers/models/auto/__init__.py b/src/transformers/models/auto/__init__.py index 34f6447bb5..98133afee2 100644 --- a/src/transformers/models/auto/__init__.py +++ b/src/transformers/models/auto/__init__.py @@ -47,7 +47,6 @@ "MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING", "MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING", "MODEL_MAPPING", - "MODEL_WITH_HEADS_MAPPING", "MODEL_WITH_LM_HEAD_MAPPING", "AutoModel", "AutoModelForAudioClassification", @@ -66,7 +65,6 @@ "AutoModelForSpeechSeq2Seq", "AutoModelForTableQuestionAnswering", "AutoModelForTokenClassification", - "AutoModelWithHeads", "AutoModelWithLMHead", ] @@ -147,7 +145,6 @@ MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, MODEL_MAPPING, - MODEL_WITH_HEADS_MAPPING, MODEL_WITH_LM_HEAD_MAPPING, AutoModel, AutoModelForAudioClassification, @@ -166,7 +163,6 @@ AutoModelForSpeechSeq2Seq, AutoModelForTableQuestionAnswering, AutoModelForTokenClassification, - AutoModelWithHeads, AutoModelWithLMHead, ) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 12770253b1..923d1fe594 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -100,19 +100,6 @@ ] ) -MODEL_WITH_HEADS_MAPPING_NAMES = OrderedDict( - [ - ("xlm-roberta", "XLMRobertaModelWithHeads"), - ("roberta", "RobertaModelWithHeads"), - ("bert", "BertModelWithHeads"), - ("distilbert", "DistilBertModelWithHeads"), - ("bart", "BartModelWithHeads"), - ("mbart", "MBartModelWithHeads"), - ("gpt2", "GPT2ModelWithHeads"), - ("t5", "T5ModelWithHeads"), - ] -) - MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict( [ # Model for pre-training mapping @@ -515,7 +502,6 @@ ) MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES) -MODEL_WITH_HEADS_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_WITH_HEADS_MAPPING_NAMES) MODEL_FOR_PRETRAINING_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_PRETRAINING_MAPPING_NAMES) MODEL_WITH_LM_HEAD_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_WITH_LM_HEAD_MAPPING_NAMES) MODEL_FOR_CAUSAL_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_CAUSAL_LM_MAPPING_NAMES) @@ -567,13 +553,6 @@ class AutoModelForPreTraining(_BaseAutoModelClass): AutoModelForPreTraining = auto_class_update(AutoModelForPreTraining, head_doc="pretraining") -class AutoModelWithHeads(_BaseAutoModelClass): - _model_mapping = MODEL_WITH_HEADS_MAPPING - - -AutoModelWithHeads = auto_class_update(AutoModelWithHeads, head_doc="flexible heads") - - # Private on purpose, the public class will add the deprecation warnings. 
class _AutoModelWithLMHead(_BaseAutoModelClass): _model_mapping = MODEL_WITH_LM_HEAD_MAPPING diff --git a/src/transformers/models/bart/__init__.py b/src/transformers/models/bart/__init__.py index ba62b452ab..a8ddcecc41 100644 --- a/src/transformers/models/bart/__init__.py +++ b/src/transformers/models/bart/__init__.py @@ -36,7 +36,6 @@ "BartForQuestionAnswering", "BartForSequenceClassification", "BartModel", - "BartModelWithHeads", "BartPretrainedModel", "PretrainedBartModel", ] @@ -68,7 +67,6 @@ BartForQuestionAnswering, BartForSequenceClassification, BartModel, - BartModelWithHeads, BartPretrainedModel, PretrainedBartModel, ) diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py index 4ca10efb3c..31dc4a4552 100755 --- a/src/transformers/models/bart/modeling_bart.py +++ b/src/transformers/models/bart/modeling_bart.py @@ -27,15 +27,13 @@ from ...activations import ACT2FN from ...adapters.composition import adjust_tensors_for_parallel from ...adapters.context import ForwardContext -from ...adapters.model_mixin import InvertibleAdaptersMixin, ModelWithHeadsAdaptersMixin -from ...adapters.models.bart import ( +from ...adapters.mixins.bart import ( BartDecoderLayerAdaptersMixin, BartEncoderLayerAdaptersMixin, BartModelAdaptersMixin, - BartModelHeadsMixin, ) +from ...adapters.model_mixin import InvertibleAdaptersMixin, ModelWithHeadsAdaptersMixin from ...file_utils import ( - ModelOutput, add_code_sample_docstrings, add_end_docstrings, add_start_docstrings, @@ -1251,141 +1249,6 @@ def forward( ) -@add_start_docstrings( - "BART Model with the option to add multiple flexible prediction heads on top.", BART_START_DOCSTRING -) -class BartModelWithHeads(BartModelHeadsMixin, BartPretrainedModel): - def __init__(self, config: BartConfig, **kwargs): - super().__init__(config, **kwargs) - self.model = BartModel(config) - - self._init_head_modules() - - def get_encoder(self): - return self.model.get_encoder() - - def get_decoder(self): - return self.model.get_decoder() - - @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint="facebook/bart-large", - output_type=ModelOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - decoder_input_ids=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - encoder_outputs=None, - inputs_embeds=None, - decoder_inputs_embeds=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - head=None, - **kwargs - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., - config.num_labels - 1]`. If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if "labels" in kwargs or "start_positions" in kwargs and "end_positions" in kwargs: - use_cache = False - - outputs = self.model( - input_ids, - attention_mask=attention_mask, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - head_mask=head_mask, - decoder_head_mask=decoder_head_mask, - cross_attn_head_mask=cross_attn_head_mask, - encoder_outputs=encoder_outputs, - inputs_embeds=inputs_embeds, - decoder_inputs_embeds=decoder_inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - # sequence classification based on last token in sequence - x = outputs[0] # last hidden state - if input_ids is not None and x.shape[1] == input_ids.shape[1]: - eos_mask = input_ids.eq(self.config.eos_token_id) - (eos_mask,) = adjust_tensors_for_parallel(x, eos_mask) - if len(torch.unique(eos_mask.sum(1))) > 1: - raise ValueError("All examples must have the same number of tokens.") - cls_representation = x[eos_mask, :].view(x.size(0), -1, x.size(-1))[:, -1, :] - else: - cls_representation = x - - head_outputs = self.forward_head( - outputs, - head_name=head, - cls_output=cls_representation, - attention_mask=attention_mask, - return_dict=return_dict, - **kwargs, - ) - - return head_outputs - - # Copied from BartForConditionalGeneration - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs - ): - # cut decoder_input_ids if past is used - if past is not None: - decoder_input_ids = decoder_input_ids[:, -1:] - - return { - "input_ids": None, # encoder_outputs is defined. input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, # change this to avoid caching (presumably for debugging) - } - - # Copied from BartForConditionalGeneration - def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): - return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) - - # Copied from BartForConditionalGeneration - @staticmethod - def _reorder_cache(past, beam_idx): - reordered_past = () - for layer_past in past: - # cached cross_attention states don't have to be reordered -> they are always the same - reordered_past += ( - tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:], - ) - return reordered_past - - @add_start_docstrings( "The BART Model with a language modeling head. 
Can be used for summarization.", BART_START_DOCSTRING ) diff --git a/src/transformers/models/bert/__init__.py b/src/transformers/models/bert/__init__.py index 1c539fb082..9bcf372282 100644 --- a/src/transformers/models/bert/__init__.py +++ b/src/transformers/models/bert/__init__.py @@ -42,7 +42,6 @@ "BertLayer", "BertLMHeadModel", "BertModel", - "BertModelWithHeads", "BertPreTrainedModel", "load_tf_weights_in_bert", ] @@ -97,7 +96,6 @@ BertLayer, BertLMHeadModel, BertModel, - BertModelWithHeads, BertPreTrainedModel, load_tf_weights_in_bert, ) diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index 704b0141a5..c53e4e1dce 100644 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -30,14 +30,9 @@ from ...activations import ACT2FN from ...adapters.composition import adjust_tensors_for_parallel -from ...adapters.context import AdapterSetup, ForwardContext +from ...adapters.context import ForwardContext +from ...adapters.mixins.bert import BertModelAdaptersMixin, BertOutputAdaptersMixin, BertSelfOutputAdaptersMixin from ...adapters.model_mixin import ModelWithHeadsAdaptersMixin -from ...adapters.models.bert import ( - BertModelAdaptersMixin, - BertModelHeadsMixin, - BertOutputAdaptersMixin, - BertSelfOutputAdaptersMixin, -) from ...file_utils import ( ModelOutput, add_code_sample_docstrings, @@ -1045,80 +1040,6 @@ def forward( ) -@add_start_docstrings( - """Bert Model transformer with the option to add multiple flexible heads on top.""", - BERT_START_DOCSTRING, -) -class BertModelWithHeads(BertModelHeadsMixin, BertPreTrainedModel): - def __init__(self, config): - super().__init__(config) - - self.bert = BertModel(config) - - self._init_head_modules() - - self.init_weights() - - @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - head=None, - **kwargs - ): - input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None - attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None - token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None - position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None - inputs_embeds = ( - inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) - if inputs_embeds is not None - else None - ) - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - # BERT & RoBERTa return the pooled output as second item, we don't need that in these heads - if not return_dict: - head_inputs = (outputs[0],) + outputs[2:] - else: - head_inputs = outputs - pooled_output = outputs[1] - - if head or AdapterSetup.get_context_head_setup() or self.active_head: - head_outputs = self.forward_head( - head_inputs, - head_name=head, - attention_mask=attention_mask, - return_dict=return_dict, - 
pooled_output=pooled_output, - **kwargs, - ) - return head_outputs - else: - # in case no head is used just return the output of the base model (including pooler output) - return outputs - - @add_start_docstrings( """ Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next diff --git a/src/transformers/models/distilbert/__init__.py b/src/transformers/models/distilbert/__init__.py index 1a7fdac64b..3be3dda0fa 100644 --- a/src/transformers/models/distilbert/__init__.py +++ b/src/transformers/models/distilbert/__init__.py @@ -42,7 +42,6 @@ "DistilBertForSequenceClassification", "DistilBertForTokenClassification", "DistilBertModel", - "DistilBertModelWithHeads", "DistilBertPreTrainedModel", ] @@ -91,7 +90,6 @@ DistilBertForSequenceClassification, DistilBertForTokenClassification, DistilBertModel, - DistilBertModelWithHeads, DistilBertPreTrainedModel, ) diff --git a/src/transformers/models/distilbert/modeling_distilbert.py b/src/transformers/models/distilbert/modeling_distilbert.py index d686b61cc5..56e5bcb71d 100755 --- a/src/transformers/models/distilbert/modeling_distilbert.py +++ b/src/transformers/models/distilbert/modeling_distilbert.py @@ -29,15 +29,10 @@ from ...activations import gelu from ...adapters.composition import adjust_tensors_for_parallel from ...adapters.context import ForwardContext +from ...adapters.mixins.distilbert import DistilBertModelAdaptersMixin, DistilBertTransfomerBlockAdaptersMixin from ...adapters.model_mixin import ModelWithHeadsAdaptersMixin -from ...adapters.models.distilbert import ( - DistilBertModelAdaptersMixin, - DistilBertModelHeadsMixin, - DistilBertTransfomerBlockAdaptersMixin, -) from ...deepspeed import is_deepspeed_zero3_enabled from ...file_utils import ( - ModelOutput, add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, @@ -576,86 +571,6 @@ def forward( ) -@add_start_docstrings( - """DistilBert Model transformer with the option to add multiple flexible heads on top.""", - DISTILBERT_START_DOCSTRING, -) -class DistilBertModelWithHeads(DistilBertModelHeadsMixin, DistilBertPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.distilbert = DistilBertModel(config) - - self._init_head_modules() - - self.init_weights() - - def get_position_embeddings(self) -> nn.Embedding: - """ - Returns the position embeddings - """ - return self.distilbert.get_position_embeddings() - - def resize_position_embeddings(self, new_num_position_embeddings: int): - """ - Resizes position embeddings of the model if :obj:`new_num_position_embeddings != - config.max_position_embeddings`. - - Arguments: - new_num_position_embeddings (:obj:`int`): - The number of new position embedding matrix. If position embeddings are learned, increasing the size - will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the - end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the - size will add correct vectors at the end following the position encoding algorithm, whereas reducing - the size will remove vectors from the end. 
- """ - self.distilbert.resize_position_embeddings(new_num_position_embeddings) - - @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices")) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint="distilbert-base-uncased", - output_type=ModelOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - head=None, - **kwargs - ): - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None - attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None - inputs_embeds = ( - inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) - if inputs_embeds is not None - else None - ) - - distilbert_output = self.distilbert( - input_ids=input_ids, - attention_mask=attention_mask, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - outputs = self.forward_head( - distilbert_output, head_name=head, attention_mask=attention_mask, return_dict=return_dict, **kwargs - ) - - return outputs - - @add_start_docstrings( """DistilBert Model with a `masked language modeling` head on top. """, DISTILBERT_START_DOCSTRING, diff --git a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py index 8eeeb9cbd7..88081c65ef 100644 --- a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py @@ -21,7 +21,7 @@ from torch.nn import CrossEntropyLoss from ...adapters.context import ForwardContext -from ...adapters.models.encoder_decoder import EncoderDecoderModelAdaptersMixin +from ...adapters.mixins.encoder_decoder import EncoderDecoderModelAdaptersMixin from ...configuration_utils import PretrainedConfig from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings from ...modeling_outputs import Seq2SeqLMOutput diff --git a/src/transformers/models/gpt2/__init__.py b/src/transformers/models/gpt2/__init__.py index 12ab7099ea..7169ddc63f 100644 --- a/src/transformers/models/gpt2/__init__.py +++ b/src/transformers/models/gpt2/__init__.py @@ -37,7 +37,6 @@ "GPT2ForTokenClassification", "GPT2LMHeadModel", "GPT2Model", - "GPT2ModelWithHeads", "GPT2PreTrainedModel", "load_tf_weights_in_gpt2", ] @@ -71,7 +70,6 @@ GPT2ForTokenClassification, GPT2LMHeadModel, GPT2Model, - GPT2ModelWithHeads, GPT2PreTrainedModel, load_tf_weights_in_gpt2, ) diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index 0f6fe2c02a..3a89e62e9a 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -36,8 +36,8 @@ from ...activations import ACT2FN from ...adapters.composition import adjust_tensors_for_parallel from ...adapters.context import ForwardContext +from ...adapters.mixins.gpt2 import GPT2DecoderBlockAdaptersMixin, GPT2ModelAdapterMixin from ...adapters.model_mixin import ModelWithHeadsAdaptersMixin -from ...adapters.models.gpt2 import GPT2DecoderBlockAdaptersMixin, GPT2ModelAdapterMixin, 
GPT2ModelHeadsMixin from ...file_utils import ( ModelOutput, add_code_sample_docstrings, @@ -1542,85 +1542,3 @@ def forward( hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, ) - - -@add_start_docstrings( - """ -The GPT2 Model that allows the loading of different heads dor different tasks. This enables a flexible use of the -models and adpters. Since this class does classification on the last token, it requires to know the position of the -last token. If a :obj:`pad_token_id` is defined in the configuration, it finds the last token that is not a padding -token in each row. If no :obj:`pad_token_id` is defined, it simply takes the last value in each row of the batch. Since -it cannot guess the padding tokens when :obj:`inputs_embeds` are passed instead of :obj:`input_ids`, it does the same -(take the last value in each row of the batch). -""", - GPT2_START_DOCSTRING, -) -class GPT2ModelWithHeads(GPT2ModelHeadsMixin, GPT2PreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.transformer = GPT2Model(config) - - self._init_head_modules() - - self.init_weights() - - # Model parallel - self.model_parallel = False - self.device_map = None - - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - head=None, - **kwargs - ): - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.transformer( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - batch_size = outputs[0].shape[0] - - if self.config.pad_token_id is None: - # TODO-AH: this may result in unexpected behavior for classification. Find a better way to do this? - sequence_lengths = -1 - else: - if input_ids is not None: - sequence_lengths = torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1 - (sequence_lengths,) = adjust_tensors_for_parallel(outputs[0], sequence_lengths) - else: - sequence_lengths = -1 - logger.warning( - f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. 
Results may be " - f"unexpected if using padding tokens in conjunction with `inputs_embeds.`" - ) - - cls_logits = outputs[0][range(batch_size), sequence_lengths] - - outputs = self.forward_head( - outputs, - head_name=head, - cls_output=cls_logits, - attention_mask=attention_mask, - return_dict=return_dict, - **kwargs, - ) - - return outputs diff --git a/src/transformers/models/mbart/__init__.py b/src/transformers/models/mbart/__init__.py index 2363c55840..613c90afbe 100644 --- a/src/transformers/models/mbart/__init__.py +++ b/src/transformers/models/mbart/__init__.py @@ -45,7 +45,6 @@ "MBartForQuestionAnswering", "MBartForSequenceClassification", "MBartModel", - "MBartModelWithHeads", "MBartPreTrainedModel", ] @@ -83,7 +82,6 @@ MBartForQuestionAnswering, MBartForSequenceClassification, MBartModel, - MBartModelWithHeads, MBartPreTrainedModel, ) diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py index 3f956ae6b2..0a998b6867 100755 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -26,15 +26,13 @@ from ...activations import ACT2FN from ...adapters.composition import adjust_tensors_for_parallel from ...adapters.context import ForwardContext -from ...adapters.model_mixin import InvertibleAdaptersMixin, ModelWithHeadsAdaptersMixin -from ...adapters.models.bart import ( +from ...adapters.mixins.bart import ( BartDecoderLayerAdaptersMixin, BartEncoderLayerAdaptersMixin, BartModelAdaptersMixin, - BartModelHeadsMixin, ) +from ...adapters.model_mixin import InvertibleAdaptersMixin, ModelWithHeadsAdaptersMixin from ...file_utils import ( - ModelOutput, add_code_sample_docstrings, add_end_docstrings, add_start_docstrings, @@ -1251,141 +1249,6 @@ def forward( ) -@add_start_docstrings( - "MBART Model with the option to add multiple flexible prediction heads on top.", MBART_START_DOCSTRING -) -class MBartModelWithHeads(BartModelHeadsMixin, MBartPreTrainedModel): - def __init__(self, config: MBartConfig, **kwargs): - super().__init__(config, **kwargs) - self.model = MBartModel(config) - - self._init_head_modules() - - def get_encoder(self): - return self.model.get_encoder() - - def get_decoder(self): - return self.model.get_decoder() - - @add_start_docstrings_to_model_forward(MBART_INPUTS_DOCSTRING) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint="facebook/mbart-large-cc25", - output_type=ModelOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - decoder_input_ids=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - encoder_outputs=None, - inputs_embeds=None, - decoder_inputs_embeds=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - head=None, - **kwargs - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., - config.num_labels - 1]`. If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if "labels" in kwargs or "start_positions" in kwargs and "end_positions" in kwargs: - use_cache = False - - outputs = self.model( - input_ids, - attention_mask=attention_mask, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - head_mask=head_mask, - decoder_head_mask=decoder_head_mask, - cross_attn_head_mask=cross_attn_head_mask, - encoder_outputs=encoder_outputs, - inputs_embeds=inputs_embeds, - decoder_inputs_embeds=decoder_inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - # sequence classification based on last token in sequence - x = outputs[0] # last hidden state - if input_ids is not None and x.shape[1] == input_ids.shape[1]: - eos_mask = input_ids.eq(self.config.eos_token_id) - (eos_mask,) = adjust_tensors_for_parallel(x, eos_mask) - if len(torch.unique(eos_mask.sum(1))) > 1: - raise ValueError("All examples must have the same number of tokens.") - cls_representation = x[eos_mask, :].view(x.size(0), -1, x.size(-1))[:, -1, :] - else: - cls_representation = x - - head_outputs = self.forward_head( - outputs, - head_name=head, - cls_output=cls_representation, - attention_mask=attention_mask, - return_dict=return_dict, - **kwargs, - ) - - return head_outputs - - # Copied from MBartForConditionalGeneration - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs - ): - # cut decoder_input_ids if past is used - if past is not None: - decoder_input_ids = decoder_input_ids[:, -1:] - - return { - "input_ids": None, # encoder_outputs is defined. input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, # change this to avoid caching (presumably for debugging) - } - - # Copied from MBartForConditionalGeneration - def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): - return shift_tokens_right(labels, self.config.pad_token_id) - - # Copied from MBartForConditionalGeneration - @staticmethod - def _reorder_cache(past, beam_idx): - reordered_past = () - for layer_past in past: - # cached cross_attention states don't have to be reordered -> they are always the same - reordered_past += ( - tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:], - ) - return reordered_past - - @add_start_docstrings( "The MBART Model with a language modeling head. 
Can be used for summarization.", MBART_START_DOCSTRING ) diff --git a/src/transformers/models/roberta/__init__.py b/src/transformers/models/roberta/__init__.py index 75d95a0972..91058cf040 100644 --- a/src/transformers/models/roberta/__init__.py +++ b/src/transformers/models/roberta/__init__.py @@ -39,7 +39,6 @@ "RobertaForSequenceClassification", "RobertaForTokenClassification", "RobertaModel", - "RobertaModelWithHeads", "RobertaPreTrainedModel", ] @@ -86,7 +85,6 @@ RobertaForSequenceClassification, RobertaForTokenClassification, RobertaModel, - RobertaModelWithHeads, RobertaPreTrainedModel, ) diff --git a/src/transformers/models/roberta/modeling_roberta.py b/src/transformers/models/roberta/modeling_roberta.py index cc543e9482..897c76c547 100644 --- a/src/transformers/models/roberta/modeling_roberta.py +++ b/src/transformers/models/roberta/modeling_roberta.py @@ -25,16 +25,10 @@ from ...activations import ACT2FN, gelu from ...adapters.composition import adjust_tensors_for_parallel -from ...adapters.context import AdapterSetup, ForwardContext +from ...adapters.context import ForwardContext +from ...adapters.mixins.bert import BertModelAdaptersMixin, BertOutputAdaptersMixin, BertSelfOutputAdaptersMixin from ...adapters.model_mixin import ModelWithHeadsAdaptersMixin -from ...adapters.models.bert import ( - BertModelAdaptersMixin, - BertModelHeadsMixin, - BertOutputAdaptersMixin, - BertSelfOutputAdaptersMixin, -) from ...file_utils import ( - ModelOutput, add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, @@ -898,86 +892,6 @@ def forward( ) -@add_start_docstrings( - """Roberta Model transformer with the option to add multiple flexible heads on top.""", - ROBERTA_START_DOCSTRING, -) -class RobertaModelWithHeads(BertModelHeadsMixin, RobertaPreTrainedModel): - def __init__(self, config): - super().__init__(config) - - self.roberta = RobertaModel(config) - - self._init_head_modules() - - self.init_weights() - - @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint="roberta-base", - output_type=ModelOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - head=None, - **kwargs - ): - input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None - position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None - token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None - attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None - inputs_embeds = ( - inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) - if inputs_embeds is not None - else None - ) - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - # BERT & RoBERTa return the pooled output as second item, we don't need that in these heads - if not return_dict: - 
head_inputs = (outputs[0],) + outputs[2:] - else: - head_inputs = outputs - pooled_output = outputs[1] - - if head or AdapterSetup.get_context_head_setup() or self.active_head: - head_outputs = self.forward_head( - head_inputs, - head_name=head, - attention_mask=attention_mask, - return_dict=return_dict, - pooled_output=pooled_output, - **kwargs, - ) - return head_outputs - else: - # in case no head is used just return the output of the base model (including pooler output) - return outputs - - @add_start_docstrings( """RoBERTa Model with a `language modeling` head on top for CLM fine-tuning. """, ROBERTA_START_DOCSTRING ) diff --git a/src/transformers/models/t5/__init__.py b/src/transformers/models/t5/__init__.py index 53d5946730..0b6e8f8ac4 100644 --- a/src/transformers/models/t5/__init__.py +++ b/src/transformers/models/t5/__init__.py @@ -44,7 +44,6 @@ "T5EncoderModel", "T5ForConditionalGeneration", "T5Model", - "T5ModelWithHeads", "T5PreTrainedModel", "load_tf_weights_in_t5", ] @@ -81,7 +80,6 @@ T5EncoderModel, T5ForConditionalGeneration, T5Model, - T5ModelWithHeads, T5PreTrainedModel, load_tf_weights_in_t5, ) diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py index ef31107730..5a325d8ede 100644 --- a/src/transformers/models/t5/modeling_t5.py +++ b/src/transformers/models/t5/modeling_t5.py @@ -28,14 +28,13 @@ from ...activations import ACT2FN from ...adapters.composition import adjust_tensors_for_parallel from ...adapters.context import ForwardContext -from ...adapters.model_mixin import InvertibleAdaptersMixin, ModelWithHeadsAdaptersMixin -from ...adapters.models.t5 import ( +from ...adapters.mixins.t5 import ( T5CrossAttentionLayerAdaptersMixin, T5FFLayerAdaptersMixin, T5ModelAdaptersMixin, - T5ModelHeadsMixin, T5SelfAttentionLayerAdaptersMixin, ) +from ...adapters.model_mixin import InvertibleAdaptersMixin, ModelWithHeadsAdaptersMixin from ...file_utils import ( DUMMY_INPUTS, DUMMY_MASK, @@ -1862,149 +1861,3 @@ def forward( ) return encoder_outputs - - -class T5ModelWithHeads(T5ModelHeadsMixin, T5PreTrainedModel): - def __init__(self, config): - super().__init__(config) - - self.transformer = T5Model(config) - - self._init_head_modules() - self._init_adapter_modules() - self.init_weights() - - # Model parallel - self.model_parallel = False - self.device_map = None - - def get_encoder(self): - return self.transformer.encoder - - def get_decoder(self): - return self.transformer.decoder - - def forward( - self, - input_ids=None, - attention_mask=None, - decoder_input_ids=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - encoder_outputs=None, - past_key_values=None, - inputs_embeds=None, - decoder_inputs_embeds=None, - labels=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - head=None, - **kwargs - ): - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: - # get decoder inputs from shifting lm labels to the right - decoder_input_ids = self._shift_right(labels) - - model_output = self.transformer( - input_ids=input_ids, - attention_mask=attention_mask, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - head_mask=head_mask, - decoder_head_mask=decoder_head_mask, - cross_attn_head_mask=cross_attn_head_mask, - encoder_outputs=encoder_outputs, - 
past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - decoder_inputs_embeds=decoder_inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - sequence_output = model_output[0] - # ToDo move head to device for parallel forward pass - - if self.config.tie_word_embeddings: - # Rescale output before projecting on vocab - # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 - new_hidden_state = sequence_output * (self.config.d_model ** -0.5) - if isinstance(model_output, tuple): - model_output = (new_hidden_state,) + model_output[1:] - else: - model_output["last_hidden_state"] = new_hidden_state - - if head or self.active_head: - kwargs["labels"] = labels - head_outputs = self.forward_head( - model_output, - head_name=head, - return_dict=return_dict, - **kwargs, - ) - return head_outputs - else: - return model_output - - # Copied from T5ForConditionalGeneration - def prepare_inputs_for_generation( - self, - input_ids, - past=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs - ): - - # cut decoder_input_ids if past is used - if past is not None: - input_ids = input_ids[:, -1:] - - return { - "decoder_input_ids": input_ids, - "past_key_values": past, - "encoder_outputs": encoder_outputs, - "attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, - } - - # Copied from T5ForConditionalGeneration - def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): - return self._shift_right(labels) - - # Copied from T5ForConditionalGeneration - def _reorder_cache(self, past, beam_idx): - # if decoder past is not included in output - # speedy decoding is disabled and no need to reorder - if past is None: - logger.warning("You might want to consider setting `use_cache=True` to speed up decoding") - return past - - reordered_decoder_past = () - for layer_past_states in past: - # get the correct batch idx from layer past batch dim - # batch dim of `past` is at 2nd position - reordered_layer_past_states = () - for layer_past_state in layer_past_states: - # need to set correct `past` for each of the four key / value states - reordered_layer_past_states = reordered_layer_past_states + ( - layer_past_state.index_select(0, beam_idx.to(layer_past_state.device)), - ) - - assert reordered_layer_past_states[0].shape == layer_past_states[0].shape - assert len(reordered_layer_past_states) == len(layer_past_states) - - reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,) - return reordered_decoder_past diff --git a/src/transformers/models/xlm_roberta/__init__.py b/src/transformers/models/xlm_roberta/__init__.py index e2d30f699e..7ef5dd2c03 100644 --- a/src/transformers/models/xlm_roberta/__init__.py +++ b/src/transformers/models/xlm_roberta/__init__.py @@ -51,7 +51,6 @@ "XLMRobertaForSequenceClassification", "XLMRobertaForTokenClassification", "XLMRobertaModel", - "XLMRobertaModelWithHeads", ] if is_tf_available(): @@ -89,7 +88,6 @@ XLMRobertaForSequenceClassification, XLMRobertaForTokenClassification, XLMRobertaModel, - XLMRobertaModelWithHeads, ) if is_tf_available(): diff --git a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py 
b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py index 90f879f766..edcf151878 100644 --- a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py @@ -25,7 +25,6 @@ RobertaForSequenceClassification, RobertaForTokenClassification, RobertaModel, - RobertaModelWithHeads, ) from .configuration_xlm_roberta import XLMRobertaConfig @@ -74,19 +73,6 @@ class XLMRobertaModel(RobertaModel): config_class = XLMRobertaConfig -@add_start_docstrings( - """XLM-RoBERTa Model with the option to add multiple flexible heads on top.""", - XLM_ROBERTA_START_DOCSTRING, -) -class XLMRobertaModelWithHeads(RobertaModelWithHeads): - """ - This class overrides :class:`~transformers.RobertaModelWithHeads`. Please check the superclass for the appropriate - documentation alongside usage examples. - """ - - config_class = XLMRobertaConfig - - @add_start_docstrings( "XLM-RoBERTa Model with a `language modeling` head on top for CLM fine-tuning.", XLM_ROBERTA_START_DOCSTRING, From 1a56b6499b54844551c0e9171d0ff2d43e7d460c Mon Sep 17 00:00:00 2001 From: calpt <36051308+calpt@users.noreply.github.com> Date: Wed, 16 Feb 2022 12:59:18 +0100 Subject: [PATCH 02/10] Refactor adapter test classes. Replace `MODEL_WITH_HEADS_MAPPING` w. `ADAPTER_MODEL_MAPPING`. --- .github/workflows/tests_torch.yml | 2 +- Makefile | 12 +--- src/transformers/__init__.py | 5 +- src/transformers/adapters/__init__.py | 3 +- src/transformers/adapters/models/auto.py | 16 ++++- src/transformers/modeling_utils.py | 10 +-- src/transformers/pipelines/base.py | 4 +- tests/test_modeling_auto.py | 2 - tests/test_modeling_bart.py | 9 +-- tests/test_modeling_bert.py | 2 - tests/test_modeling_common.py | 12 +--- tests/test_modeling_distilbert.py | 2 - tests/test_modeling_gpt2.py | 10 +-- tests/test_modeling_mbart.py | 9 +-- tests/test_modeling_roberta.py | 2 - tests/test_modeling_t5.py | 14 +--- tests_adapters/__init__.py | 0 tests_adapters/conftest.py | 65 +++++++++++++++++++ .../extended/test_adapter_trainer_ext.py | 0 .../fixtures/SiBERT/config.json | 0 .../fixtures/SiBERT/special_tokens_map.json | 0 .../fixtures/SiBERT/tokenizer_config.json | 0 .../fixtures/SiBERT/vocab.txt | 0 .../fixtures/hub-index.sample.json | 0 {tests => tests_adapters}/test_adapter.py | 2 +- .../test_adapter_common.py | 6 +- .../test_adapter_composition.py | 4 +- .../test_adapter_config.py | 0 .../test_adapter_conversion.py | 0 .../test_adapter_custom_head.py | 0 .../test_adapter_embeddings.py | 15 +++-- .../test_adapter_fusion_common.py | 4 +- .../test_adapter_fusion_config.py | 0 .../test_adapter_heads.py | 34 +++++----- {tests => tests_adapters}/test_adapter_hub.py | 3 +- .../test_adapter_save_id2label.py | 0 .../test_adapter_setup_context.py | 0 .../test_adapter_trainer.py | 0 .../test_adapter_training.py | 0 tests_adapters/test_bart.py | 12 ++++ tests_adapters/test_bert.py | 12 ++++ tests_adapters/test_common.py | 13 ++++ tests_adapters/test_distilbert.py | 12 ++++ tests_adapters/test_encoder_decoder.py | 1 + tests_adapters/test_gpt2.py | 12 ++++ tests_adapters/test_mbart.py | 12 ++++ tests_adapters/test_roberta.py | 12 ++++ tests_adapters/test_t5.py | 12 ++++ utils/run_tests.py | 37 ----------- 49 files changed, 234 insertions(+), 148 deletions(-) create mode 100644 tests_adapters/__init__.py create mode 100644 tests_adapters/conftest.py rename {tests => tests_adapters}/extended/test_adapter_trainer_ext.py (100%) rename {tests => tests_adapters}/fixtures/SiBERT/config.json (100%) rename {tests => 
tests_adapters}/fixtures/SiBERT/special_tokens_map.json (100%) rename {tests => tests_adapters}/fixtures/SiBERT/tokenizer_config.json (100%) rename {tests => tests_adapters}/fixtures/SiBERT/vocab.txt (100%) rename {tests => tests_adapters}/fixtures/hub-index.sample.json (100%) rename {tests => tests_adapters}/test_adapter.py (99%) rename {tests => tests_adapters}/test_adapter_common.py (98%) rename {tests => tests_adapters}/test_adapter_composition.py (99%) rename {tests => tests_adapters}/test_adapter_config.py (100%) rename {tests => tests_adapters}/test_adapter_conversion.py (100%) rename {tests => tests_adapters}/test_adapter_custom_head.py (100%) rename {tests => tests_adapters}/test_adapter_embeddings.py (90%) rename {tests => tests_adapters}/test_adapter_fusion_common.py (98%) rename {tests => tests_adapters}/test_adapter_fusion_config.py (100%) rename {tests => tests_adapters}/test_adapter_heads.py (90%) rename {tests => tests_adapters}/test_adapter_hub.py (99%) rename {tests => tests_adapters}/test_adapter_save_id2label.py (100%) rename {tests => tests_adapters}/test_adapter_setup_context.py (100%) rename {tests => tests_adapters}/test_adapter_trainer.py (100%) rename {tests => tests_adapters}/test_adapter_training.py (100%) create mode 100644 tests_adapters/test_bart.py create mode 100644 tests_adapters/test_bert.py create mode 100644 tests_adapters/test_common.py create mode 100644 tests_adapters/test_distilbert.py create mode 100644 tests_adapters/test_encoder_decoder.py create mode 100644 tests_adapters/test_gpt2.py create mode 100644 tests_adapters/test_mbart.py create mode 100644 tests_adapters/test_roberta.py create mode 100644 tests_adapters/test_t5.py delete mode 100644 utils/run_tests.py diff --git a/.github/workflows/tests_torch.yml b/.github/workflows/tests_torch.yml index b9483e2e8d..c3fba80ea0 100644 --- a/.github/workflows/tests_torch.yml +++ b/.github/workflows/tests_torch.yml @@ -60,4 +60,4 @@ jobs: pip install datasets - name: Test run: | - make test-reduced + make test-adapters diff --git a/Makefile b/Makefile index bcebdd8049..1f1f3fb9eb 100644 --- a/Makefile +++ b/Makefile @@ -82,16 +82,8 @@ test: # Run the adapter tests -test-adapter: - python -m pytest -n auto --dist=loadfile -s -v\ - -k test_adapter\ - --ignore-glob='tests/test_tokenization*'\ - --ignore-glob='tests/test_processor*'\ - ./tests/ - -# Run a reduced test suite in the CI pipeline of adapter-transformers -test-reduced: - python utils/run_tests.py +test-adapters: + python -m pytest -n auto --dist=loadfile -s -v ./tests_adapters/ # Run tests for examples diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index f8172f627b..e92509dc59 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -598,7 +598,6 @@ "MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING", "MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING", "MODEL_MAPPING", - "MODEL_WITH_HEADS_MAPPING", "MODEL_WITH_LM_HEAD_MAPPING", "AutoModel", "AutoModelForAudioClassification", @@ -1366,6 +1365,7 @@ "ADAPTER_MODEL_MAPPING", "DEFAULT_ADAPTER_CONFIG", "DEFAULT_ADAPTERFUSION_CONFIG", + "MODEL_WITH_HEADS_MAPPING", "AdapterArguments", "AdapterConfig", "AdapterFusionConfig", @@ -2500,7 +2500,6 @@ MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, MODEL_MAPPING, - MODEL_WITH_HEADS_MAPPING, MODEL_WITH_LM_HEAD_MAPPING, AutoModel, AutoModelForAudioClassification, @@ -3144,6 +3143,7 @@ ADAPTERFUSION_CONFIG_MAP, DEFAULT_ADAPTER_CONFIG, DEFAULT_ADAPTERFUSION_CONFIG, + MODEL_WITH_HEADS_MAPPING, 
AdapterArguments, AdapterConfig, AdapterFusionConfig, @@ -3699,6 +3699,7 @@ extra_objects={"__version__": __version__, "__adapters_version__": __adapters_version__}, ) + if not is_tf_available() and not is_torch_available() and not is_flax_available(): logger.warning( "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. " diff --git a/src/transformers/adapters/__init__.py b/src/transformers/adapters/__init__.py index 1f343ae5a7..fe76f34f81 100644 --- a/src/transformers/adapters/__init__.py +++ b/src/transformers/adapters/__init__.py @@ -75,6 +75,7 @@ ], "models.auto": [ "ADAPTER_MODEL_MAPPING", + "MODEL_WITH_HEADS_MAPPING", "AutoAdapterModel", "AutoModelWithHeads", ], @@ -175,7 +176,7 @@ ModelConfigAdaptersMixin, ModelWithHeadsAdaptersMixin, ) - from .models.auto import ADAPTER_MODEL_MAPPING, AutoAdapterModel, AutoModelWithHeads + from .models.auto import ADAPTER_MODEL_MAPPING, MODEL_WITH_HEADS_MAPPING, AutoAdapterModel, AutoModelWithHeads from .models.bart import BartAdapterModel, BartModelWithHeads from .models.bert import BertAdapterModel, BertModelWithHeads from .models.distilbert import DistilBertAdapterModel, DistilBertModelWithHeads diff --git a/src/transformers/adapters/models/auto.py b/src/transformers/adapters/models/auto.py index 7ed835af65..549c3aeefa 100644 --- a/src/transformers/adapters/models/auto.py +++ b/src/transformers/adapters/models/auto.py @@ -5,7 +5,20 @@ from ...models.auto.configuration_auto import CONFIG_MAPPING_NAMES +# Make sure that children are placed before parents! ADAPTER_MODEL_MAPPING_NAMES = OrderedDict( + [ + ("xlm-roberta", "XLMRobertaAdapterModel"), + ("roberta", "RobertaAdapterModel"), + ("bert", "BertAdapterModel"), + ("distilbert", "DistilBertAdapterModel"), + ("bart", "BartAdapterModel"), + ("mbart", "MBartAdapterModel"), + ("gpt2", "GPT2AdapterModel"), + ("t5", "T5AdapterModel"), + ] +) +MODEL_WITH_HEADS_MAPPING_NAMES = OrderedDict( [ ("xlm-roberta", "XLMRobertaModelWithHeads"), ("roberta", "RobertaModelWithHeads"), @@ -19,6 +32,7 @@ ) ADAPTER_MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, ADAPTER_MODEL_MAPPING_NAMES) +MODEL_WITH_HEADS_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_WITH_HEADS_MAPPING_NAMES) class AutoAdapterModel(_BaseAutoModelClass): @@ -29,7 +43,7 @@ class AutoAdapterModel(_BaseAutoModelClass): class AutoModelWithHeads(_BaseAutoModelClass): - _model_mapping = ADAPTER_MODEL_MAPPING + _model_mapping = MODEL_WITH_HEADS_MAPPING @classmethod def from_config(cls, config): diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 26cd92e355..0af13aa35b 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -2292,7 +2292,7 @@ def prune_layer( def apply_chunking_to_forward( - forward_fn: Callable[..., torch.Tensor], chunk_size: int, chunk_dim: int, *input_tensors, **kwargs + forward_fn: Callable[..., torch.Tensor], chunk_size: int, chunk_dim: int, *input_tensors ) -> torch.Tensor: """ This function chunks the :obj:`input_tensors` into smaller input tensor parts of size :obj:`chunk_size` over the @@ -2330,11 +2330,7 @@ def forward(self, hidden_states): assert len(input_tensors) > 0, f"{input_tensors} has to be a tuple/list of tensors" # inspect.signature exist since python 3.5 and is a python method -> no problem with backward compatibility - forward_fn_params = inspect.signature(forward_fn).parameters - num_args_in_forward_chunk_fn = len(forward_fn_params) - # subtract one for kwargs - if "kwargs" in forward_fn_params: - 
num_args_in_forward_chunk_fn -= 1 + num_args_in_forward_chunk_fn = len(inspect.signature(forward_fn).parameters) if num_args_in_forward_chunk_fn != len(input_tensors): raise ValueError( f"forward_chunk_fn expects {num_args_in_forward_chunk_fn} arguments, but only {len(input_tensors)} input " @@ -2365,4 +2361,4 @@ def forward(self, hidden_states): # concatenate output at same dimension return torch.cat(output_chunks, dim=chunk_dim) - return forward_fn(*input_tensors, **kwargs) + return forward_fn(*input_tensors) diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py index ddbec9ad88..7588eabad0 100644 --- a/src/transformers/pipelines/base.py +++ b/src/transformers/pipelines/base.py @@ -27,11 +27,11 @@ from packaging import version +from ..adapters.models.auto import ADAPTER_MODEL_MAPPING from ..feature_extraction_utils import PreTrainedFeatureExtractor from ..file_utils import ModelOutput, add_end_docstrings, is_tf_available, is_torch_available from ..modelcard import ModelCard from ..models.auto.configuration_auto import AutoConfig -from ..models.auto.modeling_auto import MODEL_WITH_HEADS_MAPPING from ..tokenization_utils import PreTrainedTokenizer from ..utils import logging @@ -815,7 +815,7 @@ def check_model_type(self, supported_models: Union[List[str], dict]): else: supported_models_names.append(model.__name__) supported_models = supported_models_names - for item in MODEL_WITH_HEADS_MAPPING.values(): + for item in ADAPTER_MODEL_MAPPING.values(): supported_models.append(item.__name__) if self.model.__class__.__name__ not in supported_models: logger.error( diff --git a/tests/test_modeling_auto.py b/tests/test_modeling_auto.py index c21dd69957..ea5cbff620 100644 --- a/tests/test_modeling_auto.py +++ b/tests/test_modeling_auto.py @@ -73,7 +73,6 @@ MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, MODEL_MAPPING, - MODEL_WITH_HEADS_MAPPING, MODEL_WITH_LM_HEAD_MAPPING, ) from transformers.models.bert.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_LIST @@ -297,7 +296,6 @@ def test_parents_and_children_in_mappings(self): mappings = ( MODEL_MAPPING, - MODEL_WITH_HEADS_MAPPING, MODEL_FOR_PRETRAINING_MAPPING, MODEL_FOR_QUESTION_ANSWERING_MAPPING, MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING, diff --git a/tests/test_modeling_bart.py b/tests/test_modeling_bart.py index 27505dfa94..957350b824 100644 --- a/tests/test_modeling_bart.py +++ b/tests/test_modeling_bart.py @@ -40,7 +40,6 @@ BartForQuestionAnswering, BartForSequenceClassification, BartModel, - BartModelWithHeads, BartTokenizer, pipeline, ) @@ -408,13 +407,7 @@ def _get_embs(m): @require_torch class BartModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): all_model_classes = ( - ( - BartModel, - BartModelWithHeads, - BartForConditionalGeneration, - BartForSequenceClassification, - BartForQuestionAnswering, - ) + (BartModel, BartForConditionalGeneration, BartForSequenceClassification, BartForQuestionAnswering) if is_torch_available() else () ) diff --git a/tests/test_modeling_bert.py b/tests/test_modeling_bert.py index e763025aad..7b7f02a553 100644 --- a/tests/test_modeling_bert.py +++ b/tests/test_modeling_bert.py @@ -39,7 +39,6 @@ BertForTokenClassification, BertLMHeadModel, BertModel, - BertModelWithHeads, ) from transformers.models.bert.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_LIST @@ -432,7 +431,6 @@ class BertModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): all_model_classes = ( ( BertModel, - BertModelWithHeads, 
BertLMHeadModel, BertForMaskedLM, BertForMultipleChoice, diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index f367c7c6b0..1cb39e799e 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -65,11 +65,9 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, MODEL_MAPPING, - MODEL_WITH_HEADS_MAPPING, AdaptiveEmbedding, BertConfig, BertModel, - ModelWithHeadsAdaptersMixin, PretrainedConfig, PreTrainedModel, T5Config, @@ -412,7 +410,7 @@ def test_training(self): config.return_dict = True for model_class in self.all_model_classes: - if model_class in get_values(MODEL_MAPPING) or model_class in get_values(MODEL_WITH_HEADS_MAPPING): + if model_class in get_values(MODEL_MAPPING): continue model = model_class(config) model.to(torch_device) @@ -430,11 +428,7 @@ def test_training_gradient_checkpointing(self): config.return_dict = True for model_class in self.all_model_classes: - if ( - model_class in get_values(MODEL_MAPPING) - or model_class in get_values(MODEL_WITH_HEADS_MAPPING) - or not model_class.supports_gradient_checkpointing - ): + if model_class in get_values(MODEL_MAPPING) or not model_class.supports_gradient_checkpointing: continue model = model_class(config) model.to(torch_device) @@ -1331,7 +1325,7 @@ def test_correct_missing_keys(self): model = model_class(config) base_model_prefix = model.base_model_prefix - if hasattr(model, base_model_prefix) and not isinstance(model, ModelWithHeadsAdaptersMixin): + if hasattr(model, base_model_prefix): with tempfile.TemporaryDirectory() as temp_dir_name: model.base_model.save_pretrained(temp_dir_name) model, loading_info = model_class.from_pretrained(temp_dir_name, output_loading_info=True) diff --git a/tests/test_modeling_distilbert.py b/tests/test_modeling_distilbert.py index 7434f10470..ed7fba94bb 100644 --- a/tests/test_modeling_distilbert.py +++ b/tests/test_modeling_distilbert.py @@ -34,7 +34,6 @@ DistilBertForSequenceClassification, DistilBertForTokenClassification, DistilBertModel, - DistilBertModelWithHeads, ) @@ -201,7 +200,6 @@ class DistilBertModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = ( ( DistilBertModel, - DistilBertModelWithHeads, DistilBertForMaskedLM, DistilBertForMultipleChoice, DistilBertForQuestionAnswering, diff --git a/tests/test_modeling_gpt2.py b/tests/test_modeling_gpt2.py index 3de5cdb802..462c6456d2 100644 --- a/tests/test_modeling_gpt2.py +++ b/tests/test_modeling_gpt2.py @@ -36,7 +36,6 @@ GPT2ForTokenClassification, GPT2LMHeadModel, GPT2Model, - GPT2ModelWithHeads, GPT2Tokenizer, ) @@ -429,14 +428,7 @@ def prepare_config_and_inputs_for_common(self): class GPT2ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): all_model_classes = ( - ( - GPT2Model, - GPT2LMHeadModel, - GPT2DoubleHeadsModel, - GPT2ForSequenceClassification, - GPT2ForTokenClassification, - GPT2ModelWithHeads, - ) + (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel, GPT2ForSequenceClassification, GPT2ForTokenClassification) if is_torch_available() else () ) diff --git a/tests/test_modeling_mbart.py b/tests/test_modeling_mbart.py index 368cf5d5b4..229eb96e90 100644 --- a/tests/test_modeling_mbart.py +++ b/tests/test_modeling_mbart.py @@ -39,7 +39,6 @@ MBartForQuestionAnswering, MBartForSequenceClassification, MBartModel, - MBartModelWithHeads, ) from transformers.models.mbart.modeling_mbart import MBartDecoder, MBartEncoder @@ -219,13 +218,7 @@ def check_encoder_decoder_model_standalone(self, config, inputs_dict): @require_torch 
class MBartModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): all_model_classes = ( - ( - MBartModel, - MBartModelWithHeads, - MBartForConditionalGeneration, - MBartForSequenceClassification, - MBartForQuestionAnswering, - ) + (MBartModel, MBartForConditionalGeneration, MBartForSequenceClassification, MBartForQuestionAnswering) if is_torch_available() else () ) diff --git a/tests/test_modeling_roberta.py b/tests/test_modeling_roberta.py index f451bdf00f..0f700009e0 100644 --- a/tests/test_modeling_roberta.py +++ b/tests/test_modeling_roberta.py @@ -36,7 +36,6 @@ RobertaForSequenceClassification, RobertaForTokenClassification, RobertaModel, - RobertaModelWithHeads, ) from transformers.models.roberta.modeling_roberta import ( ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, @@ -348,7 +347,6 @@ class RobertaModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCas RobertaForCausalLM, RobertaForMaskedLM, RobertaModel, - RobertaModelWithHeads, RobertaForSequenceClassification, RobertaForTokenClassification, RobertaForMultipleChoice, diff --git a/tests/test_modeling_t5.py b/tests/test_modeling_t5.py index 60c378b139..575850aa90 100644 --- a/tests/test_modeling_t5.py +++ b/tests/test_modeling_t5.py @@ -18,7 +18,7 @@ import tempfile import unittest -from transformers import is_torch_available +from transformers import T5Config, is_torch_available from transformers.file_utils import cached_property from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device @@ -30,15 +30,7 @@ if is_torch_available(): import torch - from transformers import ( - ByT5Tokenizer, - T5Config, - T5EncoderModel, - T5ForConditionalGeneration, - T5Model, - T5ModelWithHeads, - T5Tokenizer, - ) + from transformers import ByT5Tokenizer, T5EncoderModel, T5ForConditionalGeneration, T5Model, T5Tokenizer from transformers.models.t5.modeling_t5 import T5_PRETRAINED_MODEL_ARCHIVE_LIST @@ -515,7 +507,7 @@ def prepare_config_and_inputs_for_common(self): @require_torch class T5ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (T5Model, T5ForConditionalGeneration, T5ModelWithHeads) if is_torch_available() else () + all_model_classes = (T5Model, T5ForConditionalGeneration) if is_torch_available() else () all_generative_model_classes = (T5ForConditionalGeneration,) if is_torch_available() else () fx_ready_model_classes = all_model_classes all_parallelizable_model_classes = (T5Model, T5ForConditionalGeneration) if is_torch_available() else () diff --git a/tests_adapters/__init__.py b/tests_adapters/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests_adapters/conftest.py b/tests_adapters/conftest.py new file mode 100644 index 0000000000..03c9ed1315 --- /dev/null +++ b/tests_adapters/conftest.py @@ -0,0 +1,65 @@ +# Copyright 2022 The AdapterHub Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# tests directory-specific settings - this file is run automatically +# by pytest before any tests are run + +import sys +import warnings +from os.path import abspath, dirname, join + + +# allow having multiple repository checkouts and not needing to remember to rerun +# 'pip install -e .[dev]' when switching between checkouts and running tests. +git_repo_path = abspath(join(dirname(dirname(__file__)), "src")) +sys.path.insert(1, git_repo_path) + +# add original tests of HF to path +hf_tests_path = abspath(join(dirname(dirname(__file__)))) +sys.path.insert(1, hf_tests_path) + +# silence FutureWarning warnings in tests since often we can't act on them until +# they become normal warnings - i.e. the tests still need to test the current functionality +warnings.simplefilter(action="ignore", category=FutureWarning) + + +def pytest_configure(config): + config.addinivalue_line("markers", "is_pipeline_test: mark test to run only when pipeline are tested") + config.addinivalue_line( + "markers", "is_pt_tf_cross_test: mark test to run only when PT and TF interactions are tested" + ) + config.addinivalue_line( + "markers", "is_pt_flax_cross_test: mark test to run only when PT and FLAX interactions are tested" + ) + config.addinivalue_line("markers", "is_staging_test: mark test to run only in the staging environment") + + +def pytest_addoption(parser): + from transformers.testing_utils import pytest_addoption_shared + + pytest_addoption_shared(parser) + + +def pytest_terminal_summary(terminalreporter): + from transformers.testing_utils import pytest_terminal_summary_main + + make_reports = terminalreporter.config.getoption("--make-reports") + if make_reports: + pytest_terminal_summary_main(terminalreporter, id=make_reports) + + +def pytest_sessionfinish(session, exitstatus): + # If no tests are collected, pytest exits with code 5, which makes the CI fail.
+ if exitstatus == 5: + session.exitstatus = 0 diff --git a/tests/extended/test_adapter_trainer_ext.py b/tests_adapters/extended/test_adapter_trainer_ext.py similarity index 100% rename from tests/extended/test_adapter_trainer_ext.py rename to tests_adapters/extended/test_adapter_trainer_ext.py diff --git a/tests/fixtures/SiBERT/config.json b/tests_adapters/fixtures/SiBERT/config.json similarity index 100% rename from tests/fixtures/SiBERT/config.json rename to tests_adapters/fixtures/SiBERT/config.json diff --git a/tests/fixtures/SiBERT/special_tokens_map.json b/tests_adapters/fixtures/SiBERT/special_tokens_map.json similarity index 100% rename from tests/fixtures/SiBERT/special_tokens_map.json rename to tests_adapters/fixtures/SiBERT/special_tokens_map.json diff --git a/tests/fixtures/SiBERT/tokenizer_config.json b/tests_adapters/fixtures/SiBERT/tokenizer_config.json similarity index 100% rename from tests/fixtures/SiBERT/tokenizer_config.json rename to tests_adapters/fixtures/SiBERT/tokenizer_config.json diff --git a/tests/fixtures/SiBERT/vocab.txt b/tests_adapters/fixtures/SiBERT/vocab.txt similarity index 100% rename from tests/fixtures/SiBERT/vocab.txt rename to tests_adapters/fixtures/SiBERT/vocab.txt diff --git a/tests/fixtures/hub-index.sample.json b/tests_adapters/fixtures/hub-index.sample.json similarity index 100% rename from tests/fixtures/hub-index.sample.json rename to tests_adapters/fixtures/hub-index.sample.json diff --git a/tests/test_adapter.py b/tests_adapters/test_adapter.py similarity index 99% rename from tests/test_adapter.py rename to tests_adapters/test_adapter.py index e09fefc41b..a6ed413303 100644 --- a/tests/test_adapter.py +++ b/tests_adapters/test_adapter.py @@ -4,7 +4,6 @@ import torch from datasets import load_dataset -from tests.test_adapter_embeddings import EmbeddingTestMixin from transformers import ( AutoModel, AutoModelForSeq2SeqLM, @@ -26,6 +25,7 @@ from .test_adapter_common import AdapterModelTestMixin from .test_adapter_composition import ParallelAdapterInferenceTestMixin, ParallelTrainingMixin from .test_adapter_conversion import ModelClassConversionTestMixin +from .test_adapter_embeddings import EmbeddingTestMixin from .test_adapter_fusion_common import AdapterFusionModelTestMixin from .test_adapter_heads import PredictionHeadModelTestMixin from .test_adapter_training import AdapterTrainingTestMixin diff --git a/tests/test_adapter_common.py b/tests_adapters/test_adapter_common.py similarity index 98% rename from tests/test_adapter_common.py rename to tests_adapters/test_adapter_common.py index c02d3a78bb..bfab201361 100644 --- a/tests/test_adapter_common.py +++ b/tests_adapters/test_adapter_common.py @@ -5,7 +5,7 @@ from transformers import ( ADAPTER_CONFIG_MAP, - MODEL_WITH_HEADS_MAPPING, + ADAPTER_MODEL_MAPPING, AdapterSetup, AutoModelWithHeads, HoulsbyConfig, @@ -258,7 +258,7 @@ def test_model_config_serialization(self): model.config.to_json_string() def test_loading_adapter_weights_with_prefix(self): - if self.config_class not in MODEL_WITH_HEADS_MAPPING: + if self.config_class not in ADAPTER_MODEL_MAPPING: self.skipTest("Does not support flex heads.") model_base, model_with_head_base = create_twin_models(self.model_class, self.config) @@ -287,7 +287,7 @@ def test_loading_adapter_weights_with_prefix(self): self.assertTrue(torch.equal(output1[0], output2[0])) def test_loading_adapter_weights_without_prefix(self): - if self.config_class not in MODEL_WITH_HEADS_MAPPING: + if self.config_class not in ADAPTER_MODEL_MAPPING: self.skipTest("Does not 
support flex heads.") model_base, model_with_head_base = create_twin_models(self.model_class, self.config) diff --git a/tests/test_adapter_composition.py b/tests_adapters/test_adapter_composition.py similarity index 99% rename from tests/test_adapter_composition.py rename to tests_adapters/test_adapter_composition.py index 024ccc734a..ea8166ad62 100644 --- a/tests/test_adapter_composition.py +++ b/tests_adapters/test_adapter_composition.py @@ -4,7 +4,7 @@ import torch -from tests.test_adapter_training import filter_parameters +from tests.test_modeling_common import ids_tensor from transformers import ( MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, AutoModelWithHeads, @@ -18,7 +18,7 @@ from transformers.adapters.composition import BatchSplit, Fuse, Parallel, Split, Stack, parse_composition from transformers.testing_utils import require_torch, torch_device -from .test_modeling_common import ids_tensor +from .test_adapter_training import filter_parameters class AdapterCompositionParsingTest(unittest.TestCase): diff --git a/tests/test_adapter_config.py b/tests_adapters/test_adapter_config.py similarity index 100% rename from tests/test_adapter_config.py rename to tests_adapters/test_adapter_config.py diff --git a/tests/test_adapter_conversion.py b/tests_adapters/test_adapter_conversion.py similarity index 100% rename from tests/test_adapter_conversion.py rename to tests_adapters/test_adapter_conversion.py diff --git a/tests/test_adapter_custom_head.py b/tests_adapters/test_adapter_custom_head.py similarity index 100% rename from tests/test_adapter_custom_head.py rename to tests_adapters/test_adapter_custom_head.py diff --git a/tests/test_adapter_embeddings.py b/tests_adapters/test_adapter_embeddings.py similarity index 90% rename from tests/test_adapter_embeddings.py rename to tests_adapters/test_adapter_embeddings.py index 568e561c41..b3e64dc532 100644 --- a/tests/test_adapter_embeddings.py +++ b/tests_adapters/test_adapter_embeddings.py @@ -3,10 +3,11 @@ import torch -from tests.test_adapter_training import filter_parameters from transformers import AutoModelWithHeads, AutoTokenizer, Trainer, TrainingArguments from transformers.testing_utils import require_torch, torch_device +from .test_adapter_training import filter_parameters + @require_torch class EmbeddingTestMixin: @@ -20,13 +21,13 @@ def test_load_embeddings(self): def test_add_embeddings(self): model = self.get_model() - tokenizer = AutoTokenizer.from_pretrained("tests/fixtures/SiBERT") + tokenizer = AutoTokenizer.from_pretrained("tests_adapters/fixtures/SiBERT") model.add_embeddings("test", tokenizer) self.assertEqual(model.active_embeddings, "test") def test_delete_embeddings(self): model = self.get_model() - tokenizer = AutoTokenizer.from_pretrained("tests/fixtures/SiBERT") + tokenizer = AutoTokenizer.from_pretrained("tests_adapters/fixtures/SiBERT") model.add_embeddings("test", tokenizer) self.assertEqual(model.active_embeddings, "test") model.delete_embeddings("test") @@ -35,7 +36,7 @@ def test_delete_embeddings(self): def test_save_load_embedding(self): model = self.get_model() - tokenizer = AutoTokenizer.from_pretrained("tests/fixtures/SiBERT") + tokenizer = AutoTokenizer.from_pretrained("tests_adapters/fixtures/SiBERT") input_data = self.get_input_samples((1, 128), vocab_size=tokenizer.vocab_size, config=model.config) model.add_embeddings("test", tokenizer) model.eval() @@ -61,7 +62,7 @@ def test_back_to_default(self): model.eval() input_data = self.get_input_samples((1, 128), config=model.config) output1 = model(**input_data) - 
tokenizer = AutoTokenizer.from_pretrained("tests/fixtures/SiBERT") + tokenizer = AutoTokenizer.from_pretrained("tests_adapters/fixtures/SiBERT") model.add_embeddings("test", tokenizer) self.assertEqual(model.active_embeddings, "test") model.set_active_embeddings("default") @@ -74,7 +75,7 @@ def test_training_embedding(self): tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=False) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token - tokenizer = AutoTokenizer.from_pretrained("tests/fixtures/SiBERT") + tokenizer = AutoTokenizer.from_pretrained("tests_adapters/fixtures/SiBERT") model.add_adapter("test") self.add_head(model, "test") model.train_adapter("test", train_embeddings=True) @@ -124,7 +125,7 @@ def test_reference_embedding(self): tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=False) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token - new_tokenizer = AutoTokenizer.from_pretrained("tests/fixtures/SiBERT") + new_tokenizer = AutoTokenizer.from_pretrained("tests_adapters/fixtures/SiBERT") model.add_embeddings("test", new_tokenizer, "default", tokenizer) diff --git a/tests/test_adapter_fusion_common.py b/tests_adapters/test_adapter_fusion_common.py similarity index 98% rename from tests/test_adapter_fusion_common.py rename to tests_adapters/test_adapter_fusion_common.py index 2d6aa3d74e..10fc560c1f 100644 --- a/tests/test_adapter_fusion_common.py +++ b/tests_adapters/test_adapter_fusion_common.py @@ -6,8 +6,8 @@ import torch from transformers import ( + ADAPTER_MODEL_MAPPING, ADAPTERFUSION_CONFIG_MAP, - MODEL_WITH_HEADS_MAPPING, AdapterConfig, AutoModelWithHeads, PfeifferConfig, @@ -163,7 +163,7 @@ def test_model_config_serialization_fusion(self): model.config.to_json_string() def test_adapter_fusion_save_with_head(self): - if self.config_class not in MODEL_WITH_HEADS_MAPPING: + if self.config_class not in ADAPTER_MODEL_MAPPING: self.skipTest("Does not support flex heads.") model1 = AutoModelWithHeads.from_config(self.config()) model1.eval() diff --git a/tests/test_adapter_fusion_config.py b/tests_adapters/test_adapter_fusion_config.py similarity index 100% rename from tests/test_adapter_fusion_config.py rename to tests_adapters/test_adapter_fusion_config.py diff --git a/tests/test_adapter_heads.py b/tests_adapters/test_adapter_heads.py similarity index 90% rename from tests/test_adapter_heads.py rename to tests_adapters/test_adapter_heads.py index 3ed4f1485f..1e7b1e77f9 100644 --- a/tests/test_adapter_heads.py +++ b/tests_adapters/test_adapter_heads.py @@ -2,7 +2,7 @@ import torch -from transformers import MODEL_WITH_HEADS_MAPPING, AdapterSetup, AutoModelForSequenceClassification, AutoModelWithHeads +from transformers import ADAPTER_MODEL_MAPPING, AdapterSetup, AutoModelForSequenceClassification, AutoModelWithHeads from transformers.adapters.composition import BatchSplit, Stack from transformers.testing_utils import require_torch, torch_device @@ -52,7 +52,7 @@ def run_prediction_head_test( self.assertTrue(torch.equal(output1[idx], output2[idx])) def test_classification_head(self): - if not hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_classification_head"): + if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_classification_head"): self.skipTest("No classification head") model1, model2 = create_twin_models(AutoModelWithHeads, self.config) @@ -63,7 +63,7 @@ def test_classification_head(self): self.run_prediction_head_test(model1, model2, "dummy", label_dict=label_dict) def 
test_multiple_choice_head(self): - if not hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_multiple_choice_head"): + if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_multiple_choice_head"): self.skipTest("No multiple choice head") model1, model2 = create_twin_models(AutoModelWithHeads, self.config) @@ -76,7 +76,7 @@ def test_multiple_choice_head(self): ) def test_tagging_head(self): - if not hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_tagging_head"): + if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_tagging_head"): self.skipTest("No tagging head") model1, model2 = create_twin_models(AutoModelWithHeads, self.config) @@ -89,7 +89,7 @@ def test_tagging_head(self): ) def test_qa_head(self): - if not hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_qa_head"): + if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_qa_head"): self.skipTest("No QA head") model1, model2 = create_twin_models(AutoModelWithHeads, self.config) @@ -103,7 +103,7 @@ def test_qa_head(self): ) def test_causal_lm_head(self): - if not hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_causal_lm_head"): + if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_causal_lm_head"): self.skipTest("No causal language model head") model1, model2 = create_twin_models(AutoModelWithHeads, self.config) @@ -121,7 +121,7 @@ def test_causal_lm_head(self): ) def test_seq2seq_lm_head(self): - if not hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_seq2seq_lm_head"): + if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_seq2seq_lm_head"): self.skipTest("No seq2seq language model head") model1, model2 = create_twin_models(AutoModelWithHeads, self.config) @@ -152,7 +152,7 @@ def test_seq2seq_lm_head(self): self.assertEqual(generated.shape, (1, seq_output_length)) def test_masked_lm_head(self): - if not hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_masked_lm_head"): + if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_masked_lm_head"): self.skipTest("No causal or seq2seq language model head") model1, model2 = create_twin_models(AutoModelWithHeads, self.config) @@ -169,7 +169,7 @@ def test_masked_lm_head(self): ) def test_dependency_parsing_head(self): - if not hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_dependency_parsing_head"): + if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_dependency_parsing_head"): self.skipTest("No dependency parsing head") model1, model2 = create_twin_models(AutoModelWithHeads, self.config) @@ -205,7 +205,7 @@ def test_delete_head(self): self.assertNotEqual(name, model.active_head) def test_adapter_with_head(self): - if not hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_classification_head"): + if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_classification_head"): self.skipTest("No classification head available") model1, model2 = create_twin_models(AutoModelWithHeads, self.config) @@ -229,7 +229,7 @@ def test_adapter_with_head(self): self.assertEqual(3, output1[0].size()[1]) def test_adapter_with_head_load_as(self): - if not hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_classification_head"): + if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_classification_head"): self.skipTest("No classification head available") model1, model2 = create_twin_models(AutoModelWithHeads, self.config) @@ -268,7 +268,7 @@ def test_load_full_model(self): self.assertDictEqual(true_config, model.get_prediction_heads_config()) def 
test_batch_split_head(self): - if not hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_classification_head"): + if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_classification_head"): self.skipTest("No classification head available") model = AutoModelWithHeads.from_config(self.config()) model.add_classification_head("a") @@ -300,7 +300,7 @@ def test_batch_split_adapter_head(self): self.assertTrue(isinstance(model.active_head, BatchSplit)) def test_reload_static_to_flex_head(self): - if not hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_classification_head"): + if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_classification_head"): self.skipTest("No classification head available") static_head_model = AutoModelForSequenceClassification.from_config(self.config()) flex_head_model = AutoModelWithHeads.from_pretrained( @@ -336,11 +336,11 @@ def test_reload_static_to_flex_head(self): self.assertTrue(torch.all(torch.isclose(output1.logits, output2.logits))) def test_invertible_adapter_with_head(self): - if hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_masked_lm_head"): + if hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_masked_lm_head"): lm_head = "masked_lm" - elif hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_causal_lm_head"): + elif hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_causal_lm_head"): lm_head = "casual_lm" - elif hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_seq2seq_lm_head"): + elif hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_seq2seq_lm_head"): lm_head = "seq2seq_lm" else: self.skipTest("No masked or causel language model head") @@ -377,7 +377,7 @@ def forward_pre_hook(module, input): self.assertEqual(2, calls) def test_context_simple(self): - if not hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_classification_head"): + if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_classification_head"): self.skipTest("No classification head available") model = AutoModelWithHeads.from_config(self.config()) model.add_adapter("a") diff --git a/tests/test_adapter_hub.py b/tests_adapters/test_adapter_hub.py similarity index 99% rename from tests/test_adapter_hub.py rename to tests_adapters/test_adapter_hub.py index bd02cd8b99..1235e93b6f 100644 --- a/tests/test_adapter_hub.py +++ b/tests_adapters/test_adapter_hub.py @@ -3,6 +3,7 @@ import numpy as np +from tests.test_modeling_common import ids_tensor from transformers import ( # get_adapter_config_hash, ADAPTER_CONFIG_MAP, AdapterConfig, @@ -20,8 +21,6 @@ from transformers.adapters.utils import find_in_index from transformers.testing_utils import require_torch, torch_device -from .test_modeling_common import ids_tensor - SAMPLE_INDEX = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/hub-index.sample.json") diff --git a/tests/test_adapter_save_id2label.py b/tests_adapters/test_adapter_save_id2label.py similarity index 100% rename from tests/test_adapter_save_id2label.py rename to tests_adapters/test_adapter_save_id2label.py diff --git a/tests/test_adapter_setup_context.py b/tests_adapters/test_adapter_setup_context.py similarity index 100% rename from tests/test_adapter_setup_context.py rename to tests_adapters/test_adapter_setup_context.py diff --git a/tests/test_adapter_trainer.py b/tests_adapters/test_adapter_trainer.py similarity index 100% rename from tests/test_adapter_trainer.py rename to tests_adapters/test_adapter_trainer.py diff --git a/tests/test_adapter_training.py 
b/tests_adapters/test_adapter_training.py similarity index 100% rename from tests/test_adapter_training.py rename to tests_adapters/test_adapter_training.py diff --git a/tests_adapters/test_bart.py b/tests_adapters/test_bart.py new file mode 100644 index 0000000000..5d34ec9c6f --- /dev/null +++ b/tests_adapters/test_bart.py @@ -0,0 +1,12 @@ +from tests.test_modeling_bart import * +from transformers import BartAdapterModel +from transformers.testing_utils import require_torch + +from .test_common import AdapterModelTesterMixin + + +@require_torch +class BartAdapterModelTest(AdapterModelTesterMixin, BartModelTest): + all_model_classes = ( + BartAdapterModel, + ) diff --git a/tests_adapters/test_bert.py b/tests_adapters/test_bert.py new file mode 100644 index 0000000000..2eef1d7a09 --- /dev/null +++ b/tests_adapters/test_bert.py @@ -0,0 +1,12 @@ +from tests.test_modeling_bert import * +from transformers import BertAdapterModel +from transformers.testing_utils import require_torch + +from .test_common import AdapterModelTesterMixin + + +@require_torch +class BertAdapterModelTest(AdapterModelTesterMixin, BertModelTest): + all_model_classes = ( + BertAdapterModel, + ) diff --git a/tests_adapters/test_common.py b/tests_adapters/test_common.py new file mode 100644 index 0000000000..272f176092 --- /dev/null +++ b/tests_adapters/test_common.py @@ -0,0 +1,13 @@ +from transformers.testing_utils import require_torch + + +@require_torch +class AdapterModelTesterMixin: + def test_training(self): + self.skipTest("Not applicable.") + + def test_training_gradient_checkpointing(self): + self.skipTest("Not applicable.") + + def test_correct_missing_keys(self): + self.skipTest("Not applicable.") diff --git a/tests_adapters/test_distilbert.py b/tests_adapters/test_distilbert.py new file mode 100644 index 0000000000..d2eb8f1375 --- /dev/null +++ b/tests_adapters/test_distilbert.py @@ -0,0 +1,12 @@ +from tests.test_modeling_distilbert import * +from transformers import DistilBertAdapterModel +from transformers.testing_utils import require_torch + +from .test_common import AdapterModelTesterMixin + + +@require_torch +class DistilBertAdapterModelTest(AdapterModelTesterMixin, DistilBertModelTest): + all_model_classes = ( + DistilBertAdapterModel, + ) diff --git a/tests_adapters/test_encoder_decoder.py b/tests_adapters/test_encoder_decoder.py new file mode 100644 index 0000000000..ff0dd8cbbf --- /dev/null +++ b/tests_adapters/test_encoder_decoder.py @@ -0,0 +1 @@ +from tests.test_modeling_encoder_decoder import * diff --git a/tests_adapters/test_gpt2.py b/tests_adapters/test_gpt2.py new file mode 100644 index 0000000000..030499963a --- /dev/null +++ b/tests_adapters/test_gpt2.py @@ -0,0 +1,12 @@ +from tests.test_modeling_gpt2 import * +from transformers import GPT2AdapterModel +from transformers.testing_utils import require_torch + +from .test_common import AdapterModelTesterMixin + + +@require_torch +class GPT2AdapterModelTest(AdapterModelTesterMixin, GPT2ModelTest): + all_model_classes = ( + GPT2AdapterModel, + ) diff --git a/tests_adapters/test_mbart.py b/tests_adapters/test_mbart.py new file mode 100644 index 0000000000..e94e6f6309 --- /dev/null +++ b/tests_adapters/test_mbart.py @@ -0,0 +1,12 @@ +from tests.test_modeling_mbart import * +from transformers import MBartAdapterModel +from transformers.testing_utils import require_torch + +from .test_common import AdapterModelTesterMixin + + +@require_torch +class MBartAdapterModelTest(AdapterModelTesterMixin, MBartModelTest): + all_model_classes = ( + 
MBartAdapterModel, + ) diff --git a/tests_adapters/test_roberta.py b/tests_adapters/test_roberta.py new file mode 100644 index 0000000000..c779b6b18e --- /dev/null +++ b/tests_adapters/test_roberta.py @@ -0,0 +1,12 @@ +from tests.test_modeling_roberta import * +from transformers import RobertaAdapterModel +from transformers.testing_utils import require_torch + +from .test_common import AdapterModelTesterMixin + + +@require_torch +class RobertaAdapterModelTest(AdapterModelTesterMixin, RobertaModelTest): + all_model_classes = ( + RobertaAdapterModel, + ) diff --git a/tests_adapters/test_t5.py b/tests_adapters/test_t5.py new file mode 100644 index 0000000000..8876dab61e --- /dev/null +++ b/tests_adapters/test_t5.py @@ -0,0 +1,12 @@ +from tests.test_modeling_t5 import * +from transformers import T5AdapterModel +from transformers.testing_utils import require_torch + +from .test_common import AdapterModelTesterMixin + + +@require_torch +class T5AdapterModelTest(AdapterModelTesterMixin, T5ModelTest): + all_model_classes = ( + T5AdapterModel, + ) diff --git a/utils/run_tests.py b/utils/run_tests.py deleted file mode 100644 index 24722e9d97..0000000000 --- a/utils/run_tests.py +++ /dev/null @@ -1,37 +0,0 @@ -""" -Runs adapter tests and a subset of other tests relevant for adapter-transformers. -""" -import pytest - - -TESTED_MODULES = [ - "test_adapter", - "test_modeling_auto", - "test_modeling_bart", - "test_modeling_bert", - "test_modeling_distilbert", - "test_modeling_gpt2", - "test_modeling_mbart", - "test_modeling_roberta", - "test_modeling_xlm_roberta", - "test_modeling_encoder_decoder", - "test_modeling_t5", - "test_trainer", -] - - -if __name__ == "__main__": - test_selection = " or ".join(TESTED_MODULES) - args = [ - "-k", - test_selection, - "--numprocesses=auto", - "--dist=loadfile", - "-s", - "-v", - "--ignore-glob=tests/test_tokenization*", - "--ignore-glob=tests/test_processor*", - "./tests", - ] - exit_code = pytest.main(args) - exit(exit_code) From da3d4bedb97bae1d21f2e618228dfb110777e920 Mon Sep 17 00:00:00 2001 From: calpt <36051308+calpt@users.noreply.github.com> Date: Wed, 16 Feb 2022 13:07:38 +0100 Subject: [PATCH 03/10] Replace `AutoModelWithHeads` w. 
`AutoAdapterModel` in tests --- tests_adapters/test_adapter_common.py | 8 ++--- tests_adapters/test_adapter_composition.py | 14 ++++---- tests_adapters/test_adapter_conversion.py | 6 ++-- tests_adapters/test_adapter_custom_head.py | 12 +++---- tests_adapters/test_adapter_embeddings.py | 6 ++-- tests_adapters/test_adapter_fusion_common.py | 4 +-- tests_adapters/test_adapter_heads.py | 38 ++++++++++---------- tests_adapters/test_adapter_setup_context.py | 6 ++-- tests_adapters/test_adapter_trainer.py | 8 ++--- tests_adapters/test_adapter_training.py | 8 ++--- 10 files changed, 55 insertions(+), 55 deletions(-) diff --git a/tests_adapters/test_adapter_common.py b/tests_adapters/test_adapter_common.py index 66047738a0..0a33d9231e 100644 --- a/tests_adapters/test_adapter_common.py +++ b/tests_adapters/test_adapter_common.py @@ -8,7 +8,7 @@ ADAPTER_MODEL_MAPPING, MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, AdapterSetup, - AutoModelWithHeads, + AutoAdapterModel, HoulsbyConfig, HoulsbyInvConfig, PfeifferConfig, @@ -264,7 +264,7 @@ def test_loading_adapter_weights_with_prefix(self): model_base, model_with_head_base = create_twin_models(self.model_class, self.config) - model_with_head = AutoModelWithHeads.from_config(model_with_head_base.config) + model_with_head = AutoAdapterModel.from_config(model_with_head_base.config) setattr(model_with_head, model_with_head.base_model_prefix, model_with_head_base) model_with_head.add_adapter("dummy") @@ -293,7 +293,7 @@ def test_loading_adapter_weights_without_prefix(self): model_base, model_with_head_base = create_twin_models(self.model_class, self.config) - model_with_head = AutoModelWithHeads.from_config(model_with_head_base.config) + model_with_head = AutoAdapterModel.from_config(model_with_head_base.config) setattr(model_with_head, model_with_head.base_model_prefix, model_with_head_base) model_base.add_adapter("dummy") @@ -323,7 +323,7 @@ def test_forward_with_past(self): self.skipTest("No causal lm class.") static_model = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING[self.config_class](self.config()) - flex_model = AutoModelWithHeads.from_pretrained( + flex_model = AutoAdapterModel.from_pretrained( None, config=self.config(), state_dict=static_model.state_dict() ) static_model.add_adapter("dummy") diff --git a/tests_adapters/test_adapter_composition.py b/tests_adapters/test_adapter_composition.py index 163a8ed9aa..fd4fc0c874 100644 --- a/tests_adapters/test_adapter_composition.py +++ b/tests_adapters/test_adapter_composition.py @@ -7,7 +7,7 @@ from tests.test_modeling_common import ids_tensor from transformers import ( MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, - AutoModelWithHeads, + AutoAdapterModel, AutoTokenizer, BertConfig, BertForSequenceClassification, @@ -146,7 +146,7 @@ def test_batch_split_equivalent(self): @require_torch class ParallelAdapterInferenceTestMixin: def test_parallel_inference_with_heads(self): - model = AutoModelWithHeads.from_config(self.config()) + model = AutoAdapterModel.from_config(self.config()) model.add_adapter("a") model.add_adapter("b") @@ -179,7 +179,7 @@ def test_parallel_inference_with_heads(self): self.assertTrue(torch.allclose(outputs[1][0], outputs_b[0], atol=1e-5)) def test_parallel_inference_with_wrong_number_of_heads(self): - model = AutoModelWithHeads.from_config(self.config()) + model = AutoAdapterModel.from_config(self.config()) model.eval() model.add_adapter("a") @@ -199,7 +199,7 @@ def test_parallel_inference_with_wrong_number_of_heads(self): model(**inputs) def test_batch_split_with_heads(self): - model = 
AutoModelWithHeads.from_config(self.config()) + model = AutoAdapterModel.from_config(self.config()) model.add_adapter("a") model.add_adapter("b") self.add_head(model, "a", num_labels=2) @@ -280,7 +280,7 @@ def test_parallel_training(self): tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=False) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token - model = AutoModelWithHeads.from_config(self.config()) + model = AutoAdapterModel.from_config(self.config()) model.add_adapter("mrpc1") model.add_adapter("mrpc2") @@ -322,7 +322,7 @@ def test_parallel_training(self): self.assertTrue(torch.equal(v1, v2)) def test_parallel_training_equivalent_to_single_adapters(self): - model = AutoModelWithHeads.from_config(self.config()) + model = AutoAdapterModel.from_config(self.config()) model.eval() a1, a2 = self.create_twin_adapters(model, "a") @@ -362,7 +362,7 @@ def test_parallel_training_equivalent_to_single_adapters(self): self.assertTrue(torch.allclose(v, state_dict[k.replace(b1, b2)], atol=1e-5)) def test_parallel_training_single_forward_pass(self): - model = AutoModelWithHeads.from_config(self.config()) + model = AutoAdapterModel.from_config(self.config()) model.eval() a1, a2 = self.create_twin_adapters(model, "a") diff --git a/tests_adapters/test_adapter_conversion.py b/tests_adapters/test_adapter_conversion.py index 7d37fffcba..f44a54fe31 100644 --- a/tests_adapters/test_adapter_conversion.py +++ b/tests_adapters/test_adapter_conversion.py @@ -12,7 +12,7 @@ MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, - AutoModelWithHeads, + AutoAdapterModel, BertPreTrainedModel, RobertaPreTrainedModel, ) @@ -26,7 +26,7 @@ class ModelClassConversionTestMixin: seq_length = 128 def run_test(self, static_model, input_shape=None, label_dict=None): - flex_model = AutoModelWithHeads.from_pretrained( + flex_model = AutoAdapterModel.from_pretrained( None, config=self.config(), state_dict=static_model.state_dict() ) static_model.eval() @@ -147,7 +147,7 @@ def test_equivalent_language_generation(self): self.skipTest("no causal lm class.") static_model = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING[self.config_class](self.config()) - flex_model = AutoModelWithHeads.from_pretrained( + flex_model = AutoAdapterModel.from_pretrained( None, config=self.config(), state_dict=static_model.state_dict() ) static_model.add_adapter("dummy") diff --git a/tests_adapters/test_adapter_custom_head.py b/tests_adapters/test_adapter_custom_head.py index c3d9e703a3..50f7a9b927 100644 --- a/tests_adapters/test_adapter_custom_head.py +++ b/tests_adapters/test_adapter_custom_head.py @@ -4,7 +4,7 @@ import torch from tests.test_modeling_common import ids_tensor -from transformers import AutoConfig, AutoModelWithHeads +from transformers import AutoConfig, AutoAdapterModel from transformers.adapters.heads import ClassificationHead, PredictionHead from transformers.testing_utils import require_torch, torch_device @@ -30,7 +30,7 @@ def forward(self, outputs, cls_output=None, attention_mask=None, return_dict=Fal class AdapterCustomHeadTest(unittest.TestCase): def test_add_custom_head(self): model_name = "bert-base-uncased" - model = AutoModelWithHeads.from_pretrained(model_name) + model = AutoAdapterModel.from_pretrained(model_name) model.register_custom_head("tag", CustomHead) config = {"num_labels": 3, "layers": 2, "activation_function": "tanh"} model.add_custom_head(head_type="tag", head_name="custom_head", **config) @@ -46,7 +46,7 @@ def 
test_add_custom_head(self): def test_custom_head_from_model_config(self): model_name = "bert-base-uncased" model_config = AutoConfig.from_pretrained(model_name, custom_heads={"tag": CustomHead}) - model = AutoModelWithHeads.from_pretrained(model_name, config=model_config) + model = AutoAdapterModel.from_pretrained(model_name, config=model_config) config = {"num_labels": 3, "layers": 2, "activation_function": "tanh"} model.add_custom_head(head_type="tag", head_name="custom_head", **config) model.eval() @@ -61,8 +61,8 @@ def test_custom_head_from_model_config(self): def test_save_load_custom_head(self): model_name = "bert-base-uncased" model_config = AutoConfig.from_pretrained(model_name, custom_heads={"tag": CustomHead}) - model1 = AutoModelWithHeads.from_pretrained(model_name, config=model_config) - model2 = AutoModelWithHeads.from_pretrained(model_name, config=model_config) + model1 = AutoAdapterModel.from_pretrained(model_name, config=model_config) + model2 = AutoAdapterModel.from_pretrained(model_name, config=model_config) config = {"num_labels": 3, "layers": 2, "activation_function": "tanh"} model1.add_custom_head(head_type="tag", head_name="custom_head", **config) @@ -87,7 +87,7 @@ def test_save_load_custom_head(self): def test_builtin_head_as_custom(self): model_name = "bert-base-uncased" model_config = AutoConfig.from_pretrained(model_name, custom_heads={"tag": CustomHead}) - model = AutoModelWithHeads.from_pretrained(model_name, config=model_config) + model = AutoAdapterModel.from_pretrained(model_name, config=model_config) model.eval() in_data = ids_tensor((1, 128), 1000) diff --git a/tests_adapters/test_adapter_embeddings.py b/tests_adapters/test_adapter_embeddings.py index b3e64dc532..c6ee786392 100644 --- a/tests_adapters/test_adapter_embeddings.py +++ b/tests_adapters/test_adapter_embeddings.py @@ -3,7 +3,7 @@ import torch -from transformers import AutoModelWithHeads, AutoTokenizer, Trainer, TrainingArguments +from transformers import AutoAdapterModel, AutoTokenizer, Trainer, TrainingArguments from transformers.testing_utils import require_torch, torch_device from .test_adapter_training import filter_parameters @@ -71,7 +71,7 @@ def test_back_to_default(self): self.assertTrue(torch.equal(output1[0], output2[0])) def test_training_embedding(self): - model = AutoModelWithHeads.from_config(self.config()) + model = AutoAdapterModel.from_config(self.config()) tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=False) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token @@ -121,7 +121,7 @@ def test_training_embedding(self): ) def test_reference_embedding(self): - model = AutoModelWithHeads.from_config(self.config()) # self.get_model() + model = AutoAdapterModel.from_config(self.config()) # self.get_model() tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=False) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token diff --git a/tests_adapters/test_adapter_fusion_common.py b/tests_adapters/test_adapter_fusion_common.py index 10fc560c1f..28893a0358 100644 --- a/tests_adapters/test_adapter_fusion_common.py +++ b/tests_adapters/test_adapter_fusion_common.py @@ -9,7 +9,7 @@ ADAPTER_MODEL_MAPPING, ADAPTERFUSION_CONFIG_MAP, AdapterConfig, - AutoModelWithHeads, + AutoAdapterModel, PfeifferConfig, ) from transformers.adapters.composition import Fuse @@ -165,7 +165,7 @@ def test_model_config_serialization_fusion(self): def test_adapter_fusion_save_with_head(self): if self.config_class not in 
ADAPTER_MODEL_MAPPING: self.skipTest("Does not support flex heads.") - model1 = AutoModelWithHeads.from_config(self.config()) + model1 = AutoAdapterModel.from_config(self.config()) model1.eval() name1 = "name1" diff --git a/tests_adapters/test_adapter_heads.py b/tests_adapters/test_adapter_heads.py index 1e7b1e77f9..ae1c34d1e1 100644 --- a/tests_adapters/test_adapter_heads.py +++ b/tests_adapters/test_adapter_heads.py @@ -2,7 +2,7 @@ import torch -from transformers import ADAPTER_MODEL_MAPPING, AdapterSetup, AutoModelForSequenceClassification, AutoModelWithHeads +from transformers import ADAPTER_MODEL_MAPPING, AdapterSetup, AutoModelForSequenceClassification, AutoAdapterModel from transformers.adapters.composition import BatchSplit, Stack from transformers.testing_utils import require_torch, torch_device @@ -55,7 +55,7 @@ def test_classification_head(self): if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_classification_head"): self.skipTest("No classification head") - model1, model2 = create_twin_models(AutoModelWithHeads, self.config) + model1, model2 = create_twin_models(AutoAdapterModel, self.config) model1.add_classification_head("dummy") label_dict = {} @@ -66,7 +66,7 @@ def test_multiple_choice_head(self): if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_multiple_choice_head"): self.skipTest("No multiple choice head") - model1, model2 = create_twin_models(AutoModelWithHeads, self.config) + model1, model2 = create_twin_models(AutoAdapterModel, self.config) model1.add_multiple_choice_head("dummy") label_dict = {} @@ -79,7 +79,7 @@ def test_tagging_head(self): if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_tagging_head"): self.skipTest("No tagging head") - model1, model2 = create_twin_models(AutoModelWithHeads, self.config) + model1, model2 = create_twin_models(AutoAdapterModel, self.config) model1.add_tagging_head("dummy") label_dict = {} @@ -92,7 +92,7 @@ def test_qa_head(self): if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_qa_head"): self.skipTest("No QA head") - model1, model2 = create_twin_models(AutoModelWithHeads, self.config) + model1, model2 = create_twin_models(AutoAdapterModel, self.config) model1.add_qa_head("dummy") label_dict = {} @@ -106,7 +106,7 @@ def test_causal_lm_head(self): if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_causal_lm_head"): self.skipTest("No causal language model head") - model1, model2 = create_twin_models(AutoModelWithHeads, self.config) + model1, model2 = create_twin_models(AutoAdapterModel, self.config) model1.add_causal_lm_head("dummy") label_dict = {} @@ -124,7 +124,7 @@ def test_seq2seq_lm_head(self): if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_seq2seq_lm_head"): self.skipTest("No seq2seq language model head") - model1, model2 = create_twin_models(AutoModelWithHeads, self.config) + model1, model2 = create_twin_models(AutoAdapterModel, self.config) model1.add_seq2seq_lm_head("dummy") label_dict = {} @@ -155,7 +155,7 @@ def test_masked_lm_head(self): if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_masked_lm_head"): self.skipTest("No causal or seq2seq language model head") - model1, model2 = create_twin_models(AutoModelWithHeads, self.config) + model1, model2 = create_twin_models(AutoAdapterModel, self.config) model1.add_masked_lm_head("dummy") label_dict = {} @@ -172,7 +172,7 @@ def test_dependency_parsing_head(self): if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_dependency_parsing_head"): self.skipTest("No dependency parsing 
head") - model1, model2 = create_twin_models(AutoModelWithHeads, self.config) + model1, model2 = create_twin_models(AutoAdapterModel, self.config) model1.add_dependency_parsing_head("dummy") label_dict = {} @@ -190,7 +190,7 @@ def test_dependency_parsing_head(self): ) def test_delete_head(self): - model = AutoModelWithHeads.from_config(self.config()) + model = AutoAdapterModel.from_config(self.config()) model.eval() name = "test_head" @@ -207,7 +207,7 @@ def test_delete_head(self): def test_adapter_with_head(self): if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_classification_head"): self.skipTest("No classification head available") - model1, model2 = create_twin_models(AutoModelWithHeads, self.config) + model1, model2 = create_twin_models(AutoAdapterModel, self.config) name = "dummy" model1.add_adapter(name) @@ -231,7 +231,7 @@ def test_adapter_with_head(self): def test_adapter_with_head_load_as(self): if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_classification_head"): self.skipTest("No classification head available") - model1, model2 = create_twin_models(AutoModelWithHeads, self.config) + model1, model2 = create_twin_models(AutoAdapterModel, self.config) name = "dummy" model1.add_adapter(name) @@ -255,7 +255,7 @@ def test_adapter_with_head_load_as(self): self.assertEqual(3, output1[0].size()[1]) def test_load_full_model(self): - model = AutoModelWithHeads.from_config(self.config()) + model = AutoAdapterModel.from_config(self.config()) model.add_classification_head("dummy", layers=1) true_config = model.get_prediction_heads_config() @@ -263,14 +263,14 @@ def test_load_full_model(self): # save model.save_pretrained(temp_dir) # reload - model = AutoModelWithHeads.from_pretrained(temp_dir) + model = AutoAdapterModel.from_pretrained(temp_dir) self.assertIn("dummy", model.heads) self.assertDictEqual(true_config, model.get_prediction_heads_config()) def test_batch_split_head(self): if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_classification_head"): self.skipTest("No classification head available") - model = AutoModelWithHeads.from_config(self.config()) + model = AutoAdapterModel.from_config(self.config()) model.add_classification_head("a") model.add_classification_head("b") model.active_head = BatchSplit("a", "b", batch_sizes=[1, 2]) @@ -284,7 +284,7 @@ def test_batch_split_head(self): self.assertEqual((2, 2), out[1][0].shape) def test_batch_split_adapter_head(self): - model = AutoModelWithHeads.from_config(self.config()) + model = AutoAdapterModel.from_config(self.config()) self.add_head(model, "a") self.add_head(model, "b") model.add_adapter("a") @@ -303,7 +303,7 @@ def test_reload_static_to_flex_head(self): if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_classification_head"): self.skipTest("No classification head available") static_head_model = AutoModelForSequenceClassification.from_config(self.config()) - flex_head_model = AutoModelWithHeads.from_pretrained( + flex_head_model = AutoAdapterModel.from_pretrained( None, config=self.config(), state_dict=static_head_model.state_dict() ) static_head_model.eval() @@ -345,7 +345,7 @@ def test_invertible_adapter_with_head(self): else: self.skipTest("No masked or causel language model head") - model = AutoModelWithHeads.from_config(self.config()) + model = AutoAdapterModel.from_config(self.config()) model.add_adapter("test", config="pfeiffer+inv") if lm_head == "casual_lm": model.add_causal_lm_head("test") @@ -379,7 +379,7 @@ def forward_pre_hook(module, input): def 
test_context_simple(self): if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_classification_head"): self.skipTest("No classification head available") - model = AutoModelWithHeads.from_config(self.config()) + model = AutoAdapterModel.from_config(self.config()) model.add_adapter("a") model.add_classification_head("a", num_labels=3) # Make sure no adapter is activated diff --git a/tests_adapters/test_adapter_setup_context.py b/tests_adapters/test_adapter_setup_context.py index f9a00ff7ab..6d96535bd2 100644 --- a/tests_adapters/test_adapter_setup_context.py +++ b/tests_adapters/test_adapter_setup_context.py @@ -2,7 +2,7 @@ from threading import Thread from tests.test_modeling_common import ids_tensor -from transformers import AdapterSetup, AutoModelWithHeads, BertConfig +from transformers import AdapterSetup, AutoAdapterModel, BertConfig from transformers.testing_utils import require_torch, torch_device @@ -17,7 +17,7 @@ def setUp(self): ) def test_context_nested(self): - model = AutoModelWithHeads.from_config(self.config) + model = AutoAdapterModel.from_config(self.config) model.add_adapter("a") model.add_classification_head("a", num_labels=2) model.add_adapter("b") @@ -57,7 +57,7 @@ def forward_pre_hook_b(module, input): self.assertEqual(calls_b, 1) def test_context_multi_threading(self): - model = AutoModelWithHeads.from_config(self.config) + model = AutoAdapterModel.from_config(self.config) model.add_adapter("a") model.add_classification_head("a", num_labels=2) model.add_adapter("b") diff --git a/tests_adapters/test_adapter_trainer.py b/tests_adapters/test_adapter_trainer.py index 613522163a..8f2e1b6f6c 100644 --- a/tests_adapters/test_adapter_trainer.py +++ b/tests_adapters/test_adapter_trainer.py @@ -6,7 +6,7 @@ from transformers import ( AutoModelForSequenceClassification, - AutoModelWithHeads, + AutoAdapterModel, AutoTokenizer, BertConfig, BertForSequenceClassification, @@ -221,7 +221,7 @@ def test_reloading_prediction_head(self): ) train_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="train") - model = AutoModelWithHeads.from_pretrained("bert-base-uncased") + model = AutoAdapterModel.from_pretrained("bert-base-uncased") model.add_classification_head("adapter", num_labels=3) model.add_classification_head("dummy", num_labels=2) @@ -255,7 +255,7 @@ def test_reloading_prediction_head(self): trainer.train() # create second model that should resume the training of the first - model_resume = AutoModelWithHeads.from_pretrained("bert-base-uncased") + model_resume = AutoAdapterModel.from_pretrained("bert-base-uncased") model_resume.add_classification_head("adapter", num_labels=3) model_resume.add_classification_head("dummy", num_labels=2) @@ -290,7 +290,7 @@ def test_general(self): ) train_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="train") - model = AutoModelWithHeads.from_pretrained("bert-base-uncased") + model = AutoAdapterModel.from_pretrained("bert-base-uncased") model.add_classification_head("task", num_labels=3) diff --git a/tests_adapters/test_adapter_training.py b/tests_adapters/test_adapter_training.py index 4957184ee0..66e6db441d 100644 --- a/tests_adapters/test_adapter_training.py +++ b/tests_adapters/test_adapter_training.py @@ -2,7 +2,7 @@ import torch -from transformers import AutoModelWithHeads, AutoTokenizer, TrainingArguments +from transformers import AutoAdapterModel, AutoTokenizer, TrainingArguments from transformers.adapters.composition import BatchSplit, Fuse from transformers.adapters.trainer import AdapterTrainer as Trainer from 
transformers.testing_utils import require_torch @@ -38,7 +38,7 @@ def test_train_single_adapter(self): tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=False) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token - model = AutoModelWithHeads.from_config(self.config()) + model = AutoAdapterModel.from_config(self.config()) # add two adapters: one will be trained and the other should be frozen model.add_adapter("mrpc") @@ -76,7 +76,7 @@ def test_train_adapter_fusion(self): tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=False) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token - model = AutoModelWithHeads.from_config(self.config()) + model = AutoAdapterModel.from_config(self.config()) self.add_head(model, "head") # add the adapters to be fused @@ -138,7 +138,7 @@ def test_batch_split_training(self): tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=False) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token - model = AutoModelWithHeads.from_config(self.config()) + model = AutoAdapterModel.from_config(self.config()) model.add_adapter("mrpc1") model.add_adapter("mrpc2") From 4046aba4a5134a1a02af03fcb4d874171b8d7a01 Mon Sep 17 00:00:00 2001 From: calpt <36051308+calpt@users.noreply.github.com> Date: Wed, 16 Feb 2022 14:00:44 +0100 Subject: [PATCH 04/10] Bug fixes --- src/transformers/adapters/models/roberta.py | 11 +++++++++++ tests_adapters/extended/test_adapter_trainer_ext.py | 2 +- tests_adapters/test_adapter_composition.py | 8 ++++---- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/transformers/adapters/models/roberta.py b/src/transformers/adapters/models/roberta.py index 514a9d9a4e..4bcad514de 100644 --- a/src/transformers/adapters/models/roberta.py +++ b/src/transformers/adapters/models/roberta.py @@ -107,6 +107,17 @@ def forward( # in case no head is used just return the output of the base model (including pooler output) return outputs + head_types = { + "classification": ClassificationHead, + "multilabel_classification": MultiLabelClassificationHead, + "tagging": TaggingHead, + "multiple_choice": MultipleChoiceHead, + "question_answering": QuestionAnsweringHead, + "dependency_parsing": BiaffineParsingHead, + "masked_lm": BertStyleMaskedLMHead, + "causal_lm": CausalLMHead, + } + def add_classification_head( self, head_name, diff --git a/tests_adapters/extended/test_adapter_trainer_ext.py b/tests_adapters/extended/test_adapter_trainer_ext.py index 597f56fbb6..4ef24281a0 100644 --- a/tests_adapters/extended/test_adapter_trainer_ext.py +++ b/tests_adapters/extended/test_adapter_trainer_ext.py @@ -230,7 +230,7 @@ def run_trainer( do_eval: bool = True, do_predict: bool = True, ): - data_dir = self.test_file_dir / "../fixtures/tests_samples/wmt_en_ro" + data_dir = self.test_file_dir / "../../tests/fixtures/tests_samples/wmt_en_ro" output_dir = self.get_auto_remove_tmp_dir() args_train = f""" --model_name_or_path {model_name} diff --git a/tests_adapters/test_adapter_composition.py b/tests_adapters/test_adapter_composition.py index fd4fc0c874..ca24bc3213 100644 --- a/tests_adapters/test_adapter_composition.py +++ b/tests_adapters/test_adapter_composition.py @@ -11,7 +11,7 @@ AutoTokenizer, BertConfig, BertForSequenceClassification, - T5ModelWithHeads, + T5AdapterModel, Trainer, TrainingArguments, ) @@ -208,7 +208,7 @@ def test_batch_split_with_heads(self): model.to(torch_device) inputs = {"input_ids": self.get_input_samples((2, 128), 
config=model.config)["input_ids"]} - if isinstance(model, T5ModelWithHeads): + if isinstance(model, T5AdapterModel): inputs["decoder_input_ids"] = inputs["input_ids"] # for reference, pass through single adapters @@ -331,7 +331,7 @@ def test_parallel_training_equivalent_to_single_adapters(self): dataset = [] for i in range(3): input_data = self.get_input_samples((3, 128), config=model.config) - if isinstance(model, T5ModelWithHeads): + if isinstance(model, T5AdapterModel): input_data["labels"] = torch.randint(0, 2, (3, 128)) else: input_data["labels"] = torch.randint(0, 2, (3, 1)) @@ -376,7 +376,7 @@ def test_parallel_training_single_forward_pass(self): self.assertTrue(torch.equal(v, state_dict[k.replace(b1, b2)])) input_data = self.get_input_samples((3, 128), config=model.config) - if isinstance(model, T5ModelWithHeads): + if isinstance(model, T5AdapterModel): input_data["labels"] = torch.randint(0, 2, (3, 128), device=torch_device) else: input_data["labels"] = torch.randint(0, 2, (3, 1), device=torch_device) From 7c6a1f5f2599706dfe94757238144f7cd39ed545 Mon Sep 17 00:00:00 2001 From: calpt <36051308+calpt@users.noreply.github.com> Date: Wed, 16 Feb 2022 15:20:46 +0100 Subject: [PATCH 05/10] Add model overview doc --- README.md | 3 +++ adapter_docs/classes/models/auto.rst | 11 +++++++++++ adapter_docs/conf.py | 1 + adapter_docs/index.rst | 3 ++- adapter_docs/model_overview.md | 29 +++++++++++++++++++++++++++- 5 files changed, 45 insertions(+), 2 deletions(-) create mode 100644 adapter_docs/classes/models/auto.rst diff --git a/README.md b/README.md index 1127547a20..b918cc3afd 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,9 @@ To get started with adapters, refer to these locations: - **https://adapterhub.ml** to explore available pre-trained adapter modules and share your own adapters - **[Examples folder](https://github.com/Adapter-Hub/adapter-transformers/tree/master/examples)** of this repository containing HuggingFace's example training scripts, many adapted for training adapters +## Supported Models + +We currently support the PyTorch versions of all models listed on the **[Model Overview](https://docs.adapterhub.ml/model_overview.html) page** in our documentation. ## Citation diff --git a/adapter_docs/classes/models/auto.rst b/adapter_docs/classes/models/auto.rst new file mode 100644 index 0000000000..ae7fec236b --- /dev/null +++ b/adapter_docs/classes/models/auto.rst @@ -0,0 +1,11 @@ +Auto Classes +============ + +Similar to the ``AutoModel`` classes built-in into HuggingFace Transformers, adapter-transformers provides an ``AutoAdapterModel`` class. +As with other auto classes, the correct adapter model class is automatically instantiated based on the pre-trained model passed to the ``from_pretrained()`` method. + +AutoAdapterModel +~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.adapters.AutoAdapterModel + :members: diff --git a/adapter_docs/conf.py b/adapter_docs/conf.py index 85543b34ba..e021e83615 100644 --- a/adapter_docs/conf.py +++ b/adapter_docs/conf.py @@ -42,6 +42,7 @@ "sphinx.ext.napoleon", "sphinx_copybutton", "sphinx_multiversion", + "sphinx_markdown_tables", ] # Add any paths that contain templates here, relative to this directory. 
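For readers of the `auto.rst` page added above, here is a minimal usage sketch of the new ``AutoAdapterModel`` class. The checkpoint and the adapter/head names below are illustrative only and not part of this patch:

```python
from transformers import AutoAdapterModel

# The matching *AdapterModel subclass (e.g. BertAdapterModel for a BERT
# checkpoint) is selected automatically from the model configuration.
model = AutoAdapterModel.from_pretrained("bert-base-uncased")

# Flex-head models support adding adapters and prediction heads by name.
model.add_adapter("mrpc")
model.add_classification_head("mrpc", num_labels=2)
model.set_active_adapters("mrpc")
```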
diff --git a/adapter_docs/index.rst b/adapter_docs/index.rst index 715a0cd92b..46c1976e64 100644 --- a/adapter_docs/index.rst +++ b/adapter_docs/index.rst @@ -19,7 +19,7 @@ The *adapter-transformers* section documents the integration of adapters into th The section on *Adapter-Hub* describes the fundamentals of the pre-trained adapter repository and how to contribute new adapters. -Currently, we support the PyTorch versions of all models listed in the *Supported Models* section. +Currently, we support the PyTorch versions of all models as listed on the `Model Overview `_ page. .. toctree:: :maxdepth: 2 @@ -48,6 +48,7 @@ Currently, we support the PyTorch versions of all models listed in the *Supporte :caption: Supported Models model_overview + classes/models/auto classes/models/bart classes/models/bert classes/models/distilbert diff --git a/adapter_docs/model_overview.md b/adapter_docs/model_overview.md index 31738d1950..e299ff0941 100644 --- a/adapter_docs/model_overview.md +++ b/adapter_docs/model_overview.md @@ -1,3 +1,30 @@ # Model Overview -TODO-AH +This page gives an overview of the Transformer models currently supported by `adapter-transformers`. +The table below further shows which model architectures support which adaptation methods and which features of `adapter-transformers`. + +```eval_rst +.. note:: + Each supported model architecture X typically provides a class ``XAdapterModel`` for usage with ``AutoAdapterModel``. + Additionally, it is possible to use adapters with the model classes already shipped with HuggingFace Transformers. + E.g., for BERT, this means adapter-transformers provides a ``BertAdapterModel`` class, but you can also use ``BertModel``, ``BertForSequenceClassification`` etc. together with adapters. +``` + +| Model | (Bottleneck)
Adapters | Prefix Tuning | Compacter | Adapter Fusion | Invertible Adapters | Parallel
block | +| --------------------------------------- | -| - | - | - | - | - | +| [BART](classes/models/bart.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [BERT](classes/models/bert.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [DistilBERT](classes/models/distilbert.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [Encoder Decoder](classes/models/encoderdecoder.html) | (*) | (*) | (*) | (*) | (*) | (*) | +| [GPT-2](classes/models/gpt2.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [MBart](classes/models/mbart.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [RoBERTa](classes/models/roberta.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [T5](classes/models/t5.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [XLM-RoBERTa](classes/models/xlmroberta.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | + +(*) Depending on the used encoder and decoder model class. + +**Missing a model architecture you'd like to use?** +adapter-transformers can be easily extended to new model architectures as described in [Adding Adapters to a Model](https://github.com/Adapter-Hub/adapter-transformers/blob/master/adding_adapters_to_a_model.md). +Feel free to [open an issue](https://github.com/Adapter-Hub/adapter-transformers/issues) requesting support for a new architecture. +_We very much welcome pull requests adding new model implementations!_ From 692f8fb9c10a1f437853ebd4d07d16c73e9519c4 Mon Sep 17 00:00:00 2001 From: calpt <36051308+calpt@users.noreply.github.com> Date: Thu, 17 Feb 2022 22:47:11 +0100 Subject: [PATCH 06/10] More refactoring to adapter test classes. Upgrade model adding guide. --- adapter_docs/model_overview.md | 2 +- adding_adapters_to_a_model.md | 45 +-- tests_adapters/test_adapter.py | 395 +------------------------ tests_adapters/test_bart.py | 49 +++ tests_adapters/test_bert.py | 46 +++ tests_adapters/test_distilbert.py | 46 +++ tests_adapters/test_encoder_decoder.py | 66 ++++- tests_adapters/test_gpt2.py | 47 +++ tests_adapters/test_mbart.py | 43 +++ tests_adapters/test_roberta.py | 40 +++ tests_adapters/test_t5.py | 90 ++++++ tests_adapters/test_xlm_roberta.py | 24 ++ 12 files changed, 480 insertions(+), 413 deletions(-) create mode 100644 tests_adapters/test_xlm_roberta.py diff --git a/adapter_docs/model_overview.md b/adapter_docs/model_overview.md index e299ff0941..dd57df7af6 100644 --- a/adapter_docs/model_overview.md +++ b/adapter_docs/model_overview.md @@ -15,7 +15,7 @@ The table below further shows which model architectures support which adaptation | [BART](classes/models/bart.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | [BERT](classes/models/bert.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | [DistilBERT](classes/models/distilbert.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [Encoder Decoder](classes/models/encoderdecoder.html) | (*) | (*) | (*) | (*) | (*) | (*) | +| [Encoder Decoder](classes/models/encoderdecoder.html) | (*) | (*) | (*) | (*) | (*) | | | [GPT-2](classes/models/gpt2.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | [MBart](classes/models/mbart.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | [RoBERTa](classes/models/roberta.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | diff --git a/adding_adapters_to_a_model.md b/adding_adapters_to_a_model.md index a9f35d26fe..a2367ca0f8 100644 --- a/adding_adapters_to_a_model.md +++ b/adding_adapters_to_a_model.md @@ -14,31 +14,39 @@ Now we go through the integration of adapters into an existing model architectur ## Implementation -❓ Each model architecture with adapter support has a main `.py` module in `src/transformers/adapters/models` (e.g. `src/transformers/adapters/models/distilbert.py` for `modeling_distilbert.py`) that provides the required adapter mixins for each modeling component (e.g. 
there is a `DistilBertTransfomerBlockAdaptersMixin` for the `TransformerBlock` of DistilBERT etc.). -This is the central module to implement. +### Integration into model implementation + +❓ Adding adapter support to an existing model architecture requires modifying a few parts of the model forward pass logic. These changes have to be made directly in the respective `modeling_.py` class. +Additionally, a few adapter mixins need to be applied to the respective Transformer module classes to provide the adapter implementations to a model. +For this purpose, there typically exists a module `src/transformers/adapters/mixins/.py`. **📝 Steps** -- Add a new `.py` module for your architecture in `src/transformers/adapters/models` (or reuse an existing if possible). - - There usually should be one mixin that derives from `AdapterLayerBaseMixin` or has it as a child module. - - The mixin for the whole base model class (e.g. `BertModel`) should derive from `ModelAdaptersMixin` and (if possible) `InvertibleAdaptersMixin`. Make sure to implement the abstract methods these mixins might define. +- Add a new `.py` module for your architecture in `src/transformers/adapters/mixins` (or reuse an existing if possible). + - There usually exists a mixin on the Transformer layer level that derives that holds modules for adapter layers. + - The mixin for the whole base model class (e.g. `BertModel`) should derive from `ModelAdaptersMixin` and (if possible) `InvertibleAdaptersMixin`. This mixin should at least implement the `iter_layers()` method but might require additional modifications depending on the architecture. - Have a look at existing examples, e.g. `distilbert.py`, `bert.py`. -- Implement the mixins on the modeling classes (`modeling_.py`). - - Make sure the calls to `adapters_forward()` are added in the right places. +- Implement the mixins and the required modifications on the modeling classes (`modeling_.py`). + - Make sure the calls to `adapter_layer_forward()` are added in the right places. - The base model class (e.g. `BertModel`) should implement the mixin derived from `ModelAdaptersMixin` you created previously. - The model classes with heads (e.g. `BertForSequenceClassification`) should directly implement `ModelWithHeadsAdaptersMixin`. + - To additionally support Prefix Tuning, it's necessary to apply the forward call to the `PrefixTuningShim` module in the respective attention layer. + - Again, have a look at existing implementations, e.g. `modeling_distilbert.py` or `modeling_bart.py`. - Add the mixin for config classes, `ModelConfigAdaptersMixin`, to the model configuration class in `configuration_`. - There are some naming differences on the config attributes of different model architectures. The adapter implementation requires some additional attributes with a specific name to be available. These currently are `hidden_dropout_prob` and `attention_probs_dropout_prob` as in the `BertConfig` class. -❓ Adapter-supporting architectures have a new model class `ModelWithHeads`. -These classes allow flexible adding of and switching between multiple prediction heads of different types. +### `...AdapterModel` class + +❓ Adapter-supporting architectures should provide a new model class `AdapterModel`. +This class allows flexible adding of and switching between multiple prediction heads of different types. **📝 Steps** -- In `modeling_.py`, add a new `ModelWithHeads` class. 
- - This class should implement a mixin (in `src/transformers/adapters/models/.py`) which derives from `ModelWithFlexibleHeadsAdaptersMixin` - - In the mixin, add methods for those prediction heads that make sense for the new model architecture. -- Add `ModelWithHeads` to the `MODEL_WITH_HEADS_MAPPING` mapping in `modeling_auto.py` and to `__init__.py`. +- In `src/transformers/adapters/models`, add a new `.py` file. + - This module should implement the `AdapterModel` class, deriving from `ModelWithFlexibleHeadsAdaptersMixin` and `PreTrainedModel`. + - In the model class, add methods for those prediction heads that make sense for the new model architecture. + - Again, have a look at existing implementations, e.g. `bert.py`. Note that the `ModelWithHeads` classes in existing modules are kept for backwards compatibility and are not needed for newly added architectures. +- Add `AdapterModel` to the `ADAPTER_MODEL_MAPPING_NAMES` mapping in `src/transformers/adapters/models/auto.py` and to `src/transformers/adapters/__init__.py`. ### Additional (optional) implementation steps @@ -47,13 +55,14 @@ These classes allow flexible adding of and switching between multiple prediction ## Testing -❓ In addition to the general HuggingFace model tests, there are adapter-specific test cases (usually starting with `test_adapter_`). +❓ In addition to the general HuggingFace model tests, there are adapter-specific test cases. All tests are executed from the `tests_adapters` folder. **📝 Steps** -- Add a new `AdapterTest` class in `test_adapter.py` similar to the existing classes (e.g. `BertAdapterTest`). -- Add `ModelWithHeads` to `test_modeling_.py`. -- Insert `test_modeling_` into the list of tested modules in `utils/run_tests.py`. +- Add a new `test_.py` module in `tests_adapters`. This module typically holds three test classes: + - `AdapterModelTest` derives directly from HuggingFace's existing model test class `ModelTest` and adds `AdapterModel` as class to test. + - `AdapterModelTest` derives from a collection of test mixins that hold various adapter tests (depending on the implementation). + - (optionally) `ClassConversionTest` runs tests for correct class conversion if conversion of prediction heads is implemented. - Append `` to the list in `check_adapters.py`. ## Documentation @@ -62,7 +71,7 @@ These classes allow flexible adding of and switching between multiple prediction **📝 Steps** -- Add `adapter_docs/classes/models/.rst` (oriented at the doc file in the HF docs, make sure to include `ModelWithHeads` and the HF notice). +- Add `adapter_docs/classes/models/.rst` (oriented at the doc file in the HF docs). Make sure to include `AdapterModel` autodoc. Finally, list the file in `index.rst`. 
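To make the Testing steps above concrete, here is a minimal sketch of such a `test_<model_type>.py` module for a hypothetical `NewModel` architecture (all `NewModel*` names are placeholders), following the same pattern as the `test_bart.py` file added elsewhere in this patch series:

```python
# tests_adapters/test_newmodel.py (illustrative sketch only)
from tests.test_modeling_newmodel import *  # reuse HuggingFace's model test suite
from transformers import NewModelAdapterModel
from transformers.testing_utils import require_torch

from .test_common import AdapterModelTesterMixin


@require_torch
class NewModelAdapterModelTest(AdapterModelTesterMixin, NewModelModelTest):
    # Restrict the inherited HuggingFace tests to the new flex-head class.
    all_model_classes = (NewModelAdapterModel,)
```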
## Training Example Adapters diff --git a/tests_adapters/test_adapter.py b/tests_adapters/test_adapter.py index a6ed413303..c381f86ba9 100644 --- a/tests_adapters/test_adapter.py +++ b/tests_adapters/test_adapter.py @@ -1,34 +1,9 @@ import random -import unittest import torch -from datasets import load_dataset -from transformers import ( - AutoModel, - AutoModelForSeq2SeqLM, - BartConfig, - BertConfig, - DistilBertConfig, - EncoderDecoderConfig, - EncoderDecoderModel, - GlueDataset, - GlueDataTrainingArguments, - GPT2Config, - MBartConfig, - RobertaConfig, - T5Config, - XLMRobertaConfig, -) -from transformers.testing_utils import require_torch, torch_device - -from .test_adapter_common import AdapterModelTestMixin -from .test_adapter_composition import ParallelAdapterInferenceTestMixin, ParallelTrainingMixin -from .test_adapter_conversion import ModelClassConversionTestMixin -from .test_adapter_embeddings import EmbeddingTestMixin -from .test_adapter_fusion_common import AdapterFusionModelTestMixin -from .test_adapter_heads import PredictionHeadModelTestMixin -from .test_adapter_training import AdapterTrainingTestMixin +from transformers import AutoModel, GlueDataset, GlueDataTrainingArguments +from transformers.testing_utils import torch_device def make_config(config_class, **kwargs): @@ -74,369 +49,3 @@ def dataset(self, tokenizer): task_name="mrpc", data_dir="./tests/fixtures/tests_samples/MRPC", overwrite_cache=True ) return GlueDataset(data_args, tokenizer=tokenizer, mode="train") - - -class BertAdapterTestBase(AdapterTestBase): - config_class = BertConfig - config = make_config( - BertConfig, - hidden_size=32, - num_hidden_layers=4, - num_attention_heads=4, - intermediate_size=37, - ) - tokenizer_name = "bert-base-uncased" - - -@require_torch -class BertAdapterTest( - EmbeddingTestMixin, - AdapterModelTestMixin, - AdapterFusionModelTestMixin, - PredictionHeadModelTestMixin, - AdapterTrainingTestMixin, - ParallelAdapterInferenceTestMixin, - ParallelTrainingMixin, - BertAdapterTestBase, - unittest.TestCase, -): - pass - - -@require_torch -class BertClassConversionTest( - ModelClassConversionTestMixin, - BertAdapterTestBase, - unittest.TestCase, -): - pass - - -class RobertaAdapterTestBase(AdapterTestBase): - config_class = RobertaConfig - config = make_config( - RobertaConfig, - hidden_size=32, - num_hidden_layers=4, - num_attention_heads=4, - intermediate_size=37, - ) - - -@require_torch -class RobertaAdapterTest( - AdapterModelTestMixin, - AdapterFusionModelTestMixin, - PredictionHeadModelTestMixin, - ParallelAdapterInferenceTestMixin, - RobertaAdapterTestBase, - unittest.TestCase, -): - pass - - -@require_torch -class RobertaClassConversionTest( - ModelClassConversionTestMixin, - RobertaAdapterTestBase, - unittest.TestCase, -): - pass - - -@require_torch -class XLMRobertaClassConversionTest( - ModelClassConversionTestMixin, - AdapterTestBase, - unittest.TestCase, -): - config_class = XLMRobertaConfig - config = make_config( - XLMRobertaConfig, - hidden_size=32, - num_hidden_layers=4, - num_attention_heads=4, - intermediate_size=37, - ) - - -class DistilBertAdapterTestBase(AdapterTestBase): - config_class = DistilBertConfig - config = make_config( - DistilBertConfig, - dim=32, - n_layers=4, - n_heads=4, - hidden_dim=37, - ) - tokenizer_name = "distilbert-base-uncased" - - -@require_torch -class DistilBertAdapterTest( - AdapterModelTestMixin, - EmbeddingTestMixin, - AdapterFusionModelTestMixin, - PredictionHeadModelTestMixin, - AdapterTrainingTestMixin, - 
ParallelAdapterInferenceTestMixin, - ParallelTrainingMixin, - DistilBertAdapterTestBase, - unittest.TestCase, -): - pass - - -@require_torch -class DistilBertClassConversionTest( - ModelClassConversionTestMixin, - DistilBertAdapterTestBase, - unittest.TestCase, -): - pass - - -class BartAdapterTestBase(AdapterTestBase): - config_class = BartConfig - config = make_config( - BartConfig, - d_model=16, - encoder_layers=2, - decoder_layers=2, - encoder_attention_heads=4, - decoder_attention_heads=4, - encoder_ffn_dim=4, - decoder_ffn_dim=4, - ) - tokenizer_name = "facebook/bart-base" - - -@require_torch -class BartAdapterTest( - AdapterModelTestMixin, - AdapterFusionModelTestMixin, - EmbeddingTestMixin, - PredictionHeadModelTestMixin, - AdapterTrainingTestMixin, - ParallelAdapterInferenceTestMixin, - ParallelTrainingMixin, - BartAdapterTestBase, - unittest.TestCase, -): - pass - - -@require_torch -class BartClassConversionTest( - ModelClassConversionTestMixin, - BartAdapterTestBase, - unittest.TestCase, -): - pass - - -class MBartAdapterTestBase(AdapterTestBase): - config_class = MBartConfig - config = make_config( - MBartConfig, - d_model=16, - encoder_layers=2, - decoder_layers=2, - encoder_attention_heads=4, - decoder_attention_heads=4, - encoder_ffn_dim=4, - decoder_ffn_dim=4, - ) - - -@require_torch -class MBartAdapterTest( - AdapterModelTestMixin, - AdapterFusionModelTestMixin, - PredictionHeadModelTestMixin, - ParallelAdapterInferenceTestMixin, - MBartAdapterTestBase, - unittest.TestCase, -): - pass - - -@require_torch -class MBartClassConversionTest( - ModelClassConversionTestMixin, - MBartAdapterTestBase, - unittest.TestCase, -): - pass - - -class GPT2AdapterTestBase(AdapterTestBase): - config_class = GPT2Config - config = make_config( - GPT2Config, - n_embd=32, - n_layer=4, - n_head=4, - # set pad token to eos token - pad_token_id=50256, - ) - tokenizer_name = "gpt2" - - -@require_torch -class GPT2AdapterTest( - AdapterModelTestMixin, - EmbeddingTestMixin, - AdapterFusionModelTestMixin, - PredictionHeadModelTestMixin, - AdapterTrainingTestMixin, - ParallelAdapterInferenceTestMixin, - ParallelTrainingMixin, - GPT2AdapterTestBase, - unittest.TestCase, -): - pass - - -@require_torch -class GPT2ClassConversionTest( - ModelClassConversionTestMixin, - GPT2AdapterTestBase, - unittest.TestCase, -): - pass - - -class EncoderDecoderAdapterTestBase(AdapterTestBase): - model_class = EncoderDecoderModel - config_class = EncoderDecoderConfig - config = staticmethod( - lambda: EncoderDecoderConfig.from_encoder_decoder_configs( - BertConfig( - hidden_size=32, - num_hidden_layers=4, - num_attention_heads=4, - intermediate_size=37, - ), - BertConfig( - hidden_size=32, - num_hidden_layers=4, - num_attention_heads=4, - intermediate_size=37, - is_decoder=True, - add_cross_attention=True, - ), - ) - ) - tokenizer_name = "bert-base-uncased" - - -@require_torch -class EncoderDecoderAdapterTest( - AdapterModelTestMixin, - AdapterFusionModelTestMixin, - EncoderDecoderAdapterTestBase, - unittest.TestCase, -): - def test_invertible_adapter_with_head(self): - """This test class is copied and adapted from the identically-named test in test_adapter_heads.py.""" - model = AutoModelForSeq2SeqLM.from_config(self.config()) - model.add_adapter("test", config="pfeiffer+inv") - model.set_active_adapters("test") - - # Set a hook before the invertible adapter to make sure it's actually called twice: - # Once after the embedding layer and once in the prediction head. 
- calls = 0 - - def forward_pre_hook(module, input): - nonlocal calls - calls += 1 - - inv_adapter = model.base_model.get_invertible_adapter() - self.assertIsNotNone(inv_adapter) - inv_adapter.register_forward_pre_hook(forward_pre_hook) - - in_data = self.get_input_samples((1, 128), config=model.config) - model.to(torch_device) - out = model(**in_data) - - self.assertEqual((1, 128, model.config.decoder.vocab_size), out[0].shape) - self.assertEqual(2, calls) - - -@require_torch -class T5AdapterTestBase(AdapterTestBase): - config_class = T5Config - config = make_config( - T5Config, - d_model=16, - num_layers=2, - num_decoder_layers=2, - num_heads=4, - d_ff=4, - d_kv=16 // 4, - tie_word_embeddings=False, - decoder_start_token_id=0, - ) - tokenizer_name = "t5-base" - - def add_head(self, model, name, **kwargs): - model.add_seq2seq_lm_head(name) - - def dataset(self, tokenizer): - def preprocess_function(examples): - inputs = examples["document"] - targets = examples["summary"] - inputs = ["Summarize: " + inp for inp in inputs] - model_inputs = tokenizer(inputs, padding=True, truncation=True) - - # Setup the tokenizer for targets - with tokenizer.as_target_tokenizer(): - labels = tokenizer(targets, padding=True, truncation=True) - - # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore - # padding in the loss. - labels["input_ids"] = [ - [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] - ] - - model_inputs["labels"] = labels["input_ids"] - return model_inputs - - data_args = { - "task_name": "xsum", - "path": "./tests/fixtures/tests_samples/xsum/sample.json", - } - dataset = load_dataset("json", data_files=data_args["path"]) - train_dataset = dataset["train"] - train_dataset = train_dataset.map( - preprocess_function, - batched=True, - desc="Running tokenizer on train dataset", - ) - return train_dataset - - -@require_torch -class T5AdapterTest( - T5AdapterTestBase, - EmbeddingTestMixin, - ParallelAdapterInferenceTestMixin, - ParallelTrainingMixin, - AdapterModelTestMixin, - AdapterFusionModelTestMixin, - AdapterTrainingTestMixin, - PredictionHeadModelTestMixin, - AdapterTestBase, - unittest.TestCase, -): - pass - - -@require_torch -class T5ClassConversionTest( - ModelClassConversionTestMixin, - T5AdapterTestBase, - unittest.TestCase, -): - pass diff --git a/tests_adapters/test_bart.py b/tests_adapters/test_bart.py index 5d34ec9c6f..9990b914e2 100644 --- a/tests_adapters/test_bart.py +++ b/tests_adapters/test_bart.py @@ -1,7 +1,17 @@ +import unittest + from tests.test_modeling_bart import * from transformers import BartAdapterModel from transformers.testing_utils import require_torch +from .test_adapter import AdapterTestBase, make_config +from .test_adapter_common import AdapterModelTestMixin +from .test_adapter_composition import ParallelAdapterInferenceTestMixin, ParallelTrainingMixin +from .test_adapter_conversion import ModelClassConversionTestMixin +from .test_adapter_embeddings import EmbeddingTestMixin +from .test_adapter_fusion_common import AdapterFusionModelTestMixin +from .test_adapter_heads import PredictionHeadModelTestMixin +from .test_adapter_training import AdapterTrainingTestMixin from .test_common import AdapterModelTesterMixin @@ -10,3 +20,42 @@ class BartAdapterModelTest(AdapterModelTesterMixin, BartModelTest): all_model_classes = ( BartAdapterModel, ) + + +class BartAdapterTestBase(AdapterTestBase): + config_class = BartConfig + config = make_config( + BartConfig, + 
d_model=16, + encoder_layers=2, + decoder_layers=2, + encoder_attention_heads=4, + decoder_attention_heads=4, + encoder_ffn_dim=4, + decoder_ffn_dim=4, + ) + tokenizer_name = "facebook/bart-base" + + +@require_torch +class BartAdapterTest( + AdapterModelTestMixin, + AdapterFusionModelTestMixin, + EmbeddingTestMixin, + PredictionHeadModelTestMixin, + AdapterTrainingTestMixin, + ParallelAdapterInferenceTestMixin, + ParallelTrainingMixin, + BartAdapterTestBase, + unittest.TestCase, +): + pass + + +@require_torch +class BartClassConversionTest( + ModelClassConversionTestMixin, + BartAdapterTestBase, + unittest.TestCase, +): + pass diff --git a/tests_adapters/test_bert.py b/tests_adapters/test_bert.py index 2eef1d7a09..57d0d54ac1 100644 --- a/tests_adapters/test_bert.py +++ b/tests_adapters/test_bert.py @@ -1,7 +1,17 @@ +import unittest + from tests.test_modeling_bert import * from transformers import BertAdapterModel from transformers.testing_utils import require_torch +from .test_adapter import AdapterTestBase, make_config +from .test_adapter_common import AdapterModelTestMixin +from .test_adapter_composition import ParallelAdapterInferenceTestMixin, ParallelTrainingMixin +from .test_adapter_conversion import ModelClassConversionTestMixin +from .test_adapter_embeddings import EmbeddingTestMixin +from .test_adapter_fusion_common import AdapterFusionModelTestMixin +from .test_adapter_heads import PredictionHeadModelTestMixin +from .test_adapter_training import AdapterTrainingTestMixin from .test_common import AdapterModelTesterMixin @@ -10,3 +20,39 @@ class BertAdapterModelTest(AdapterModelTesterMixin, BertModelTest): all_model_classes = ( BertAdapterModel, ) + + +class BertAdapterTestBase(AdapterTestBase): + config_class = BertConfig + config = make_config( + BertConfig, + hidden_size=32, + num_hidden_layers=4, + num_attention_heads=4, + intermediate_size=37, + ) + tokenizer_name = "bert-base-uncased" + + +@require_torch +class BertAdapterTest( + EmbeddingTestMixin, + AdapterModelTestMixin, + AdapterFusionModelTestMixin, + PredictionHeadModelTestMixin, + AdapterTrainingTestMixin, + ParallelAdapterInferenceTestMixin, + ParallelTrainingMixin, + BertAdapterTestBase, + unittest.TestCase, +): + pass + + +@require_torch +class BertClassConversionTest( + ModelClassConversionTestMixin, + BertAdapterTestBase, + unittest.TestCase, +): + pass diff --git a/tests_adapters/test_distilbert.py b/tests_adapters/test_distilbert.py index d2eb8f1375..bdba3dbfde 100644 --- a/tests_adapters/test_distilbert.py +++ b/tests_adapters/test_distilbert.py @@ -1,7 +1,17 @@ +import unittest + from tests.test_modeling_distilbert import * from transformers import DistilBertAdapterModel from transformers.testing_utils import require_torch +from .test_adapter import AdapterTestBase, make_config +from .test_adapter_common import AdapterModelTestMixin +from .test_adapter_composition import ParallelAdapterInferenceTestMixin, ParallelTrainingMixin +from .test_adapter_conversion import ModelClassConversionTestMixin +from .test_adapter_embeddings import EmbeddingTestMixin +from .test_adapter_fusion_common import AdapterFusionModelTestMixin +from .test_adapter_heads import PredictionHeadModelTestMixin +from .test_adapter_training import AdapterTrainingTestMixin from .test_common import AdapterModelTesterMixin @@ -10,3 +20,39 @@ class DistilBertAdapterModelTest(AdapterModelTesterMixin, DistilBertModelTest): all_model_classes = ( DistilBertAdapterModel, ) + + +class DistilBertAdapterTestBase(AdapterTestBase): + config_class = 
DistilBertConfig + config = make_config( + DistilBertConfig, + dim=32, + n_layers=4, + n_heads=4, + hidden_dim=37, + ) + tokenizer_name = "distilbert-base-uncased" + + +@require_torch +class DistilBertAdapterTest( + AdapterModelTestMixin, + EmbeddingTestMixin, + AdapterFusionModelTestMixin, + PredictionHeadModelTestMixin, + AdapterTrainingTestMixin, + ParallelAdapterInferenceTestMixin, + ParallelTrainingMixin, + DistilBertAdapterTestBase, + unittest.TestCase, +): + pass + + +@require_torch +class DistilBertClassConversionTest( + ModelClassConversionTestMixin, + DistilBertAdapterTestBase, + unittest.TestCase, +): + pass diff --git a/tests_adapters/test_encoder_decoder.py b/tests_adapters/test_encoder_decoder.py index ff0dd8cbbf..55532003b4 100644 --- a/tests_adapters/test_encoder_decoder.py +++ b/tests_adapters/test_encoder_decoder.py @@ -1 +1,65 @@ -from tests.test_modeling_encoder_decoder import * +import unittest + +from tests.test_modeling_encoder_decoder import * # Imported to execute model tests +from transformers import AutoModelForSeq2SeqLM, BertConfig + +from .test_adapter import AdapterTestBase +from .test_adapter_common import AdapterModelTestMixin +from .test_adapter_fusion_common import AdapterFusionModelTestMixin + + +class EncoderDecoderAdapterTestBase(AdapterTestBase): + model_class = EncoderDecoderModel + config_class = EncoderDecoderConfig + config = staticmethod( + lambda: EncoderDecoderConfig.from_encoder_decoder_configs( + BertConfig( + hidden_size=32, + num_hidden_layers=4, + num_attention_heads=4, + intermediate_size=37, + ), + BertConfig( + hidden_size=32, + num_hidden_layers=4, + num_attention_heads=4, + intermediate_size=37, + is_decoder=True, + add_cross_attention=True, + ), + ) + ) + tokenizer_name = "bert-base-uncased" + + +@require_torch +class EncoderDecoderAdapterTest( + AdapterModelTestMixin, + AdapterFusionModelTestMixin, + EncoderDecoderAdapterTestBase, + unittest.TestCase, +): + def test_invertible_adapter_with_head(self): + """This test class is copied and adapted from the identically-named test in test_adapter_heads.py.""" + model = AutoModelForSeq2SeqLM.from_config(self.config()) + model.add_adapter("test", config="pfeiffer+inv") + model.set_active_adapters("test") + + # Set a hook before the invertible adapter to make sure it's actually called twice: + # Once after the embedding layer and once in the prediction head. 
+ calls = 0 + + def forward_pre_hook(module, input): + nonlocal calls + calls += 1 + + inv_adapter = model.base_model.get_invertible_adapter() + self.assertIsNotNone(inv_adapter) + inv_adapter.register_forward_pre_hook(forward_pre_hook) + + in_data = self.get_input_samples((1, 128), config=model.config) + model.to(torch_device) + out = model(**in_data) + + self.assertEqual((1, 128, model.config.decoder.vocab_size), out[0].shape) + self.assertEqual(2, calls) diff --git a/tests_adapters/test_gpt2.py b/tests_adapters/test_gpt2.py index 030499963a..eac05aed0a 100644 --- a/tests_adapters/test_gpt2.py +++ b/tests_adapters/test_gpt2.py @@ -1,7 +1,17 @@ +import unittest + from tests.test_modeling_gpt2 import * from transformers import GPT2AdapterModel from transformers.testing_utils import require_torch +from .test_adapter import AdapterTestBase, make_config +from .test_adapter_common import AdapterModelTestMixin +from .test_adapter_composition import ParallelAdapterInferenceTestMixin, ParallelTrainingMixin +from .test_adapter_conversion import ModelClassConversionTestMixin +from .test_adapter_embeddings import EmbeddingTestMixin +from .test_adapter_fusion_common import AdapterFusionModelTestMixin +from .test_adapter_heads import PredictionHeadModelTestMixin +from .test_adapter_training import AdapterTrainingTestMixin from .test_common import AdapterModelTesterMixin @@ -10,3 +20,40 @@ class GPT2AdapterModelTest(AdapterModelTesterMixin, GPT2ModelTest): all_model_classes = ( GPT2AdapterModel, ) + + +class GPT2AdapterTestBase(AdapterTestBase): + config_class = GPT2Config + config = make_config( + GPT2Config, + n_embd=32, + n_layer=4, + n_head=4, + # set pad token to eos token + pad_token_id=50256, + ) + tokenizer_name = "gpt2" + + +@require_torch +class GPT2AdapterTest( + AdapterModelTestMixin, + EmbeddingTestMixin, + AdapterFusionModelTestMixin, + PredictionHeadModelTestMixin, + AdapterTrainingTestMixin, + ParallelAdapterInferenceTestMixin, + ParallelTrainingMixin, + GPT2AdapterTestBase, + unittest.TestCase, +): + pass + + +@require_torch +class GPT2ClassConversionTest( + ModelClassConversionTestMixin, + GPT2AdapterTestBase, + unittest.TestCase, +): + pass diff --git a/tests_adapters/test_mbart.py b/tests_adapters/test_mbart.py index e94e6f6309..416c5b1f4d 100644 --- a/tests_adapters/test_mbart.py +++ b/tests_adapters/test_mbart.py @@ -1,7 +1,15 @@ +import unittest + from tests.test_modeling_mbart import * from transformers import MBartAdapterModel from transformers.testing_utils import require_torch +from .test_adapter import AdapterTestBase, make_config +from .test_adapter_common import AdapterModelTestMixin +from .test_adapter_composition import ParallelAdapterInferenceTestMixin +from .test_adapter_conversion import ModelClassConversionTestMixin +from .test_adapter_fusion_common import AdapterFusionModelTestMixin +from .test_adapter_heads import PredictionHeadModelTestMixin from .test_common import AdapterModelTesterMixin @@ -10,3 +18,38 @@ class MBartAdapterModelTest(AdapterModelTesterMixin, MBartModelTest): all_model_classes = ( MBartAdapterModel, ) + + +class MBartAdapterTestBase(AdapterTestBase): + config_class = MBartConfig + config = make_config( + MBartConfig, + d_model=16, + encoder_layers=2, + decoder_layers=2, + encoder_attention_heads=4, + decoder_attention_heads=4, + encoder_ffn_dim=4, + decoder_ffn_dim=4, + ) + + +@require_torch +class MBartAdapterTest( + AdapterModelTestMixin, + AdapterFusionModelTestMixin, + PredictionHeadModelTestMixin, + ParallelAdapterInferenceTestMixin, + 
MBartAdapterTestBase, + unittest.TestCase, +): + pass + + +@require_torch +class MBartClassConversionTest( + ModelClassConversionTestMixin, + MBartAdapterTestBase, + unittest.TestCase, +): + pass diff --git a/tests_adapters/test_roberta.py b/tests_adapters/test_roberta.py index c779b6b18e..bb4af5be6d 100644 --- a/tests_adapters/test_roberta.py +++ b/tests_adapters/test_roberta.py @@ -1,7 +1,15 @@ +import unittest + from tests.test_modeling_roberta import * from transformers import RobertaAdapterModel from transformers.testing_utils import require_torch +from .test_adapter import AdapterTestBase, make_config +from .test_adapter_common import AdapterModelTestMixin +from .test_adapter_composition import ParallelAdapterInferenceTestMixin +from .test_adapter_conversion import ModelClassConversionTestMixin +from .test_adapter_fusion_common import AdapterFusionModelTestMixin +from .test_adapter_heads import PredictionHeadModelTestMixin from .test_common import AdapterModelTesterMixin @@ -10,3 +18,35 @@ class RobertaAdapterModelTest(AdapterModelTesterMixin, RobertaModelTest): all_model_classes = ( RobertaAdapterModel, ) + + +class RobertaAdapterTestBase(AdapterTestBase): + config_class = RobertaConfig + config = make_config( + RobertaConfig, + hidden_size=32, + num_hidden_layers=4, + num_attention_heads=4, + intermediate_size=37, + ) + + +@require_torch +class RobertaAdapterTest( + AdapterModelTestMixin, + AdapterFusionModelTestMixin, + PredictionHeadModelTestMixin, + ParallelAdapterInferenceTestMixin, + RobertaAdapterTestBase, + unittest.TestCase, +): + pass + + +@require_torch +class RobertaClassConversionTest( + ModelClassConversionTestMixin, + RobertaAdapterTestBase, + unittest.TestCase, +): + pass diff --git a/tests_adapters/test_t5.py b/tests_adapters/test_t5.py index 8876dab61e..41067c6cca 100644 --- a/tests_adapters/test_t5.py +++ b/tests_adapters/test_t5.py @@ -1,7 +1,19 @@ +import unittest + +from datasets import load_dataset + from tests.test_modeling_t5 import * from transformers import T5AdapterModel from transformers.testing_utils import require_torch +from .test_adapter import AdapterTestBase, make_config +from .test_adapter_common import AdapterModelTestMixin +from .test_adapter_composition import ParallelAdapterInferenceTestMixin, ParallelTrainingMixin +from .test_adapter_conversion import ModelClassConversionTestMixin +from .test_adapter_embeddings import EmbeddingTestMixin +from .test_adapter_fusion_common import AdapterFusionModelTestMixin +from .test_adapter_heads import PredictionHeadModelTestMixin +from .test_adapter_training import AdapterTrainingTestMixin from .test_common import AdapterModelTesterMixin @@ -10,3 +22,81 @@ class T5AdapterModelTest(AdapterModelTesterMixin, T5ModelTest): all_model_classes = ( T5AdapterModel, ) + + +@require_torch +class T5AdapterTestBase(AdapterTestBase): + config_class = T5Config + config = make_config( + T5Config, + d_model=16, + num_layers=2, + num_decoder_layers=2, + num_heads=4, + d_ff=4, + d_kv=16 // 4, + tie_word_embeddings=False, + decoder_start_token_id=0, + ) + tokenizer_name = "t5-base" + + def add_head(self, model, name, **kwargs): + model.add_seq2seq_lm_head(name) + + def dataset(self, tokenizer): + def preprocess_function(examples): + inputs = examples["document"] + targets = examples["summary"] + inputs = ["Summarize: " + inp for inp in inputs] + model_inputs = tokenizer(inputs, padding=True, truncation=True) + + # Setup the tokenizer for targets + with tokenizer.as_target_tokenizer(): + labels = tokenizer(targets, 
padding=True, truncation=True) + + # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore + # padding in the loss. + labels["input_ids"] = [ + [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] + ] + + model_inputs["labels"] = labels["input_ids"] + return model_inputs + + data_args = { + "task_name": "xsum", + "path": "./tests/fixtures/tests_samples/xsum/sample.json", + } + dataset = load_dataset("json", data_files=data_args["path"]) + train_dataset = dataset["train"] + train_dataset = train_dataset.map( + preprocess_function, + batched=True, + desc="Running tokenizer on train dataset", + ) + return train_dataset + + +@require_torch +class T5AdapterTest( + T5AdapterTestBase, + EmbeddingTestMixin, + ParallelAdapterInferenceTestMixin, + ParallelTrainingMixin, + AdapterModelTestMixin, + AdapterFusionModelTestMixin, + AdapterTrainingTestMixin, + PredictionHeadModelTestMixin, + AdapterTestBase, + unittest.TestCase, +): + pass + + +@require_torch +class T5ClassConversionTest( + ModelClassConversionTestMixin, + T5AdapterTestBase, + unittest.TestCase, +): + pass diff --git a/tests_adapters/test_xlm_roberta.py b/tests_adapters/test_xlm_roberta.py new file mode 100644 index 0000000000..b141814dfc --- /dev/null +++ b/tests_adapters/test_xlm_roberta.py @@ -0,0 +1,24 @@ +import unittest + +from transformers import XLMRobertaConfig +from transformers.testing_utils import require_torch + +from .test_adapter import AdapterTestBase, make_config +from .test_adapter_conversion import ModelClassConversionTestMixin + + +@require_torch +class XLMRobertaClassConversionTest( + ModelClassConversionTestMixin, + AdapterTestBase, + unittest.TestCase, +): + config_class = XLMRobertaConfig + config = make_config( + XLMRobertaConfig, + hidden_size=32, + num_hidden_layers=4, + num_attention_heads=4, + intermediate_size=37, + ) + From 819230e1732c23c86f0821cfe4a5f29f168db202 Mon Sep 17 00:00:00 2001 From: calpt <36051308+calpt@users.noreply.github.com> Date: Thu, 17 Feb 2022 22:50:01 +0100 Subject: [PATCH 07/10] Replace `AutoModelWithHeads` with `AutoAdapterModel` in docs --- adapter_docs/adapter_composition.md | 2 +- adapter_docs/huggingface_hub.md | 4 ++-- adapter_docs/loading.md | 4 ++-- adapter_docs/prediction_heads.md | 4 ++-- adapter_docs/training.md | 4 ++-- src/transformers/adapters/hub_mixin.py | 4 ++-- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/adapter_docs/adapter_composition.md b/adapter_docs/adapter_composition.md index edde41dfdd..d0c4d7b68e 100644 --- a/adapter_docs/adapter_composition.md +++ b/adapter_docs/adapter_composition.md @@ -175,7 +175,7 @@ In the following example, we load two adapters for semantic textual similarity ( We activate a parallel setup where the input is passed through both adapters and their respective prediction heads. 
```python -model = AutoModelWithHeads.from_pretrained("distilbert-base-uncased") +model = AutoAdapterModel.from_pretrained("distilbert-base-uncased") tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") adapter1 = model.load_adapter("sts/sts-b@ukp") diff --git a/adapter_docs/huggingface_hub.md b/adapter_docs/huggingface_hub.md index db09760126..86f37795ea 100644 --- a/adapter_docs/huggingface_hub.md +++ b/adapter_docs/huggingface_hub.md @@ -18,9 +18,9 @@ Alternatively, all adapters on the HuggingFace Model Hub are also listed on [htt After you have found an adapter you would like to use, loading it into a Transformer model is very similar to [loading adapters from AdapterHub](loading.md). For example, for loading and activating the adapter [`AdapterHub/roberta-base-pf-sick`](https://huggingface.co/AdapterHub/roberta-base-pf-sick), write: ```python -from transformers import AutoModelWithHeads +from transformers import AutoAdapterModel -model = AutoModelWithHeads.from_pretrained("roberta-base") +model = AutoAdapterModel.from_pretrained("roberta-base") adapter_name = model.load_adapter("AdapterHub/roberta-base-pf-sick", source="hf") model.active_adapters = adapter_name ``` diff --git a/adapter_docs/loading.md b/adapter_docs/loading.md index 8812962c92..a93594729a 100644 --- a/adapter_docs/loading.md +++ b/adapter_docs/loading.md @@ -55,13 +55,13 @@ model.set_active_adapters(adapter_name) As the second example, let's have a look at how to load an adapter based on the [`AdapterInfo`](classes/adapter_utils.html#transformers.adapters.utils.AdapterInfo) returned by the [`list_adapters()`](classes/adapter_utils.html#transformers.adapters.utils.list_adapters) method from [above](#finding-pre-trained-adapters): ```python -from transformers import AutoModelWithHeads, list_available_adapters +from transformers import AutoAdapterModel, list_available_adapters adapter_infos = list_available_adapters(source="ah") # Take the first adapter info as an example adapter_info = adapter_infos[0] -model = AutoModelWithHeads.from_pretrained(adapter_info.model_name) +model = AutoAdapterModel.from_pretrained(adapter_info.model_name) model.load_adapter(adapter_info.adapter_id, source=adapter_info.source) ``` diff --git a/adapter_docs/prediction_heads.md b/adapter_docs/prediction_heads.md index 2c7ab0d7bc..082edd741f 100644 --- a/adapter_docs/prediction_heads.md +++ b/adapter_docs/prediction_heads.md @@ -93,14 +93,14 @@ In case the classes match, our prediction head weights will be automatically loa ## Automatic conversion -Beginning with v2.1 of `adapter-transformers`, it is possible to load static heads, e.g. created with `AutoModelForSequenceClassification`, into model classes with flexible heads, e.g. `AutoModelWithHeads`. +Beginning with v2.1 of `adapter-transformers`, it is possible to load static heads, e.g. created with `AutoModelForSequenceClassification`, into model classes with flexible heads, e.g. `AutoAdapterModel`. 
The conversion of weights happens automatically during the call of `load_adapter()`, so no additional steps are needed: ```python static_head_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased") static_head_model.add_adapter("test") static_head_model.save_adapter(temp_dir, "test") -flex_head_model = AutoModelWithHeads.from_pretrained("bert-base-uncased") +flex_head_model = AutoAdapterModel.from_pretrained("bert-base-uncased") flex_head_model.load_adapter(temp_dir) assert "test" in flex_head_model.config.adapters diff --git a/adapter_docs/training.md b/adapter_docs/training.md index f94a7fcca9..41f45e451d 100644 --- a/adapter_docs/training.md +++ b/adapter_docs/training.md @@ -16,11 +16,11 @@ pip install -r ./examples//requirements.txt Training a task adapter module on a dataset only requires minor modifications from training the full model. Suppose we have an existing script for training a Transformer model, here we will use HuggingFace's [run_glue.py](https://github.com/Adapter-Hub/adapter-transformers/blob/master/examples/text-classification/run_glue.py) example script for training on the GLUE dataset. -In our example, we replaced the built-in `AutoModelForSequenceClassification` class with the `AutoModelWithHeads` class introduced by `adapter-transformers` (learn more about prediction heads [here](prediction_heads.md)). +In our example, we replaced the built-in `AutoModelForSequenceClassification` class with the `AutoAdapterModel` class introduced by `adapter-transformers` (learn more about prediction heads [here](prediction_heads.md)). Therefore, the model instantiation changed to: ```python -model = AutoModelWithHeads.from_pretrained( +model = AutoAdapterModel.from_pretrained( model_args.model_name_or_path, config=config, ) diff --git a/src/transformers/adapters/hub_mixin.py b/src/transformers/adapters/hub_mixin.py index b1df68b7fd..29babd6cf5 100644 --- a/src/transformers/adapters/hub_mixin.py +++ b/src/transformers/adapters/hub_mixin.py @@ -34,9 +34,9 @@ Now, the adapter can be loaded and activated like this: ```python -from transformers import AutoModelWithHeads +from transformers import AutoAdapterModel -model = AutoModelWithHeads.from_pretrained("{model_name}") +model = AutoAdapterModel.from_pretrained("{model_name}") adapter_name = model.load_adapter("{adapter_repo_name}", source="hf", set_active=True) ``` From d2fc3228f0f629f94f7f060d69647fb9d699267b Mon Sep 17 00:00:00 2001 From: calpt <36051308+calpt@users.noreply.github.com> Date: Mon, 21 Feb 2022 20:57:03 +0100 Subject: [PATCH 08/10] Use AdapterModel classes in more places --- adapter_docs/classes/models/bert.rst | 2 +- adapter_docs/prediction_heads.md | 42 ++++++++++---------- adapter_docs/quickstart.md | 30 +++++++------- tests_adapters/test_adapter_hub.py | 4 +- tests_adapters/test_adapter_save_id2label.py | 8 ++-- 5 files changed, 42 insertions(+), 44 deletions(-) diff --git a/adapter_docs/classes/models/bert.rst b/adapter_docs/classes/models/bert.rst index 06695ad24d..e4c7d7d405 100644 --- a/adapter_docs/classes/models/bert.rst +++ b/adapter_docs/classes/models/bert.rst @@ -9,6 +9,6 @@ pre-trained using a combination of masked language modeling objective and next s BertAdapterModel ~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.adapters.BertModelWithHeads +.. 
autoclass:: transformers.adapters.BertAdapterModel :members: :inherited-members: BertPreTrainedModel diff --git a/adapter_docs/prediction_heads.md b/adapter_docs/prediction_heads.md index 082edd741f..98d955ba69 100644 --- a/adapter_docs/prediction_heads.md +++ b/adapter_docs/prediction_heads.md @@ -1,38 +1,31 @@ # Prediction Heads This section gives an overview how different prediction heads can be used together with adapter modules and how pre-trained adapters can be distributed side-by-side with matching prediction heads in AdapterHub. -We will take a look at our own new **model classes with flexible heads** (e.g. `BertModelWithHeads`) as well as **models with static heads** provided out-of-the-box by HuggingFace (e.g. `BertForSequenceClassification`). +We will take a look at the `AdapterModel` classes (e.g. `BertAdapterModel`) introduced by adapter-transformers, which provide **flexible** support for prediction heads, as well as models with **static** heads provided out-of-the-box by HuggingFace Transformers (e.g. `BertForSequenceClassification`). ```eval_rst .. tip:: - We recommend to use the `model classes with flexible heads <#models-with-flexible-heads>`_ whenever possible. + We recommend to use the `AdapterModel classes <#adaptermodel-classes>`_ whenever possible. They have been created specifically for working with adapters and provide more flexibility. ``` -```eval_rst -.. important:: - Although the two prediction head implementations serve the same use case, their weights are *not* directly compatible, i.e. you cannot load a head created with ``AutoModelWithHeds`` into a model of type ``AutoModelForSequenceClassification``. - There is however an `automatic conversion to model classes with flexible heads <#automatic-conversion>`_. -``` +## AdapterModel classes -## Models with flexible heads +The AdapterModel classes provided by `adapter-transformers` allow a flexible configuration of prediction heads on top of a pre-trained language model. -To allow for prediction heads to be configured in a flexible way on top of a pre-trained language model, `adapter-transformers` provides a new line of model classes. -These classes follow the naming schema `WithHeads` and are available for all model classes supporting adapters. Let's see how they work: - -First, we load pre-trained model from HuggingFace: +First, we load pre-trained model from the HuggingFace Hub via the [`AutoAdapterModel`](classes/models/auto.html#transformers.adapters.AutoAdapterModel) class: ```python -model = BertModelWithHeads.from_pretrained("bert-base-uncased") +model = AutoAdapterModel.from_pretrained("bert-base-uncased") ``` -Although we use the class `BertModelWithHeads`, this model doesn't have any heads yet. We add a new one in the next step: +By default, this model doesn't have any heads yet. We add a new one in the next step: ```python model.add_classification_head("mrpc", num_labels=2) ``` The line above adds a binary sequence classification head on top of our model. As this head is named, we could add multiple other heads with different names to the same model. This is especially useful if used together with matching adapter modules. -For more about the different head types and the configuration options, refer to the class references of the respective model classes, e.g. [BertModelWithHeads](classes/models/bert.html#transformers.BertModelWithHeads). +To learn more about the different head types and the configuration options, please refer to the class references of the respective model classes, e.g. 
[`BertAdapterModel`](classes/models/bert.html#transformers.adapters.BertAdapterModel). Now, of course, we would like to train our classification head together with an adapter, so let's add one: ```python @@ -49,9 +42,10 @@ At this point, we can start to [train our setup](training.md). The ``set_active_adapters()`` will search for an adapter and a prediction head with the given name to be activated. Alternatively, prediction heads can also be activated explicitly (i.e. without adapter modules). These three options are possible (in order of priority when multiple are specified): - 1. If ``head`` is passed to the forward call, the head with the given name is used. - 2. If the forward call is executed within an ``AdapterSetup`` context, the head configuration is read from the context. - 3. If the ``active_head`` property is set, the head configuration is read from there. + + 1. If ``head`` is passed to the forward call, the head with the given name is used. + 2. If the forward call is executed within an ``AdapterSetup`` context, the head configuration is read from the context. + 3. If the ``active_head`` property is set, the head configuration is read from there. ``` After training has completed, we can save our whole setup (adapter module _and_ prediction head), with a single call: @@ -59,8 +53,8 @@ After training has completed, we can save our whole setup (adapter module _and_ model.save_adapter("/path/to/dir", "mrpc", with_head=True) ``` -Now, we just have to [share our work with the world](contributing.html#add-your-pre-trained-adapter). -After we published our adapter together with its head in the Hub, anyone else can load both adapter and head by using the same model class. +Now, you just have to [share your work with the world](contributing.html#add-your-pre-trained-adapter). +After you published our adapter together with its head in the Hub, anyone else can load both adapter and head by using the same model class. Alternatively, we can also save and load the prediction head separately from an adapter module: @@ -77,7 +71,7 @@ Lastly, it's also possible to delete an added head again: model.delete_head("mrpc") ``` -## Model with static heads (HuggingFace heads) +## Model classes with static heads (HuggingFace Transformers) The `transformers` library provides strongly typed model classes with heads for various different tasks (e.g. `RobertaForSequenceClassification`, `AutoModelForMultipleChoice` ...). If an adapter module is trained with one these out-of-the-box classes, it is encouraged to also distribute the prediction head weights together with the adapter weights. @@ -93,6 +87,12 @@ In case the classes match, our prediction head weights will be automatically loa ## Automatic conversion +```eval_rst +.. important:: + Although the two prediction head implementations serve the same use case, their weights are *not* directly compatible, i.e. you cannot load a head created with ``AutoAdapterModel`` into a model of type ``AutoModelForSequenceClassification``. + There is however an automatic conversion to model classes with flexible heads. +``` + Beginning with v2.1 of `adapter-transformers`, it is possible to load static heads, e.g. created with `AutoModelForSequenceClassification`, into model classes with flexible heads, e.g. `AutoAdapterModel`. 
The conversion of weights happens automatically during the call of `load_adapter()`, so no additional steps are needed: ```python diff --git a/adapter_docs/quickstart.md b/adapter_docs/quickstart.md index 6a10aa4352..a0a7b74c75 100644 --- a/adapter_docs/quickstart.md +++ b/adapter_docs/quickstart.md @@ -21,31 +21,29 @@ The following example shows the usage of a basic pre-trained transformer model w Our goal here is to predict the sentiment of a given sentence. We use BERT in this example, so we first load a pre-trained `BertTokenizer` to encode the input sentence and a pre-trained -`BertModel` from HuggingFace: +`bert-base-uncased` checkpoint from HuggingFace's Model Hub using the [`BertAdapterModel`](classes/models/bert.html#transformers.adapters.BertAdapterModel) class: ```python import torch -from transformers import BertTokenizer, BertForSequenceClassification +from transformers import BertTokenizer +from transformers.adapters import BertAdapterModel -# output more information -import logging -logging.basicConfig(level=logging.INFO) - -# load pre-trained BERT tokenizer from Huggingface +# Load pre-trained BERT tokenizer from Huggingface. tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') -# tokenize an input sentence +# An input sentence. sentence = "It's also, clearly, great fun." -# convert input tokens to indices and create PyTorch input tensor -input_tensor = torch.tensor([tokenizer.encode(sentence)]) +# Tokenize the input sentence and create a PyTorch input tensor. +input_data = tokenizer(sentence, return_tensors='pytorch') -# load pre-trained BERT model from Huggingface -# the `BertForSequenceClassification` class includes a prediction head for sequence classification -model = BertForSequenceClassification.from_pretrained('bert-base-uncased') +# Load pre-trained BERT model from HuggingFace Hub. +# The `BertAdapterModel` class is specifically designed for working with adapters. +# It can be used with different prediction heads. +model = BertAdapterModel.from_pretrained('bert-base-uncased') ``` -Having loaded the model, we now add a pre-trained task adapter that is useful to our task from Adapter Hub. +Having loaded the model, we now add a pre-trained task adapter that is useful to our task from AdapterHub. As we're doing sentiment classification, we use [an adapter trained on the SST-2 dataset](https://adapterhub.ml/adapters/ukp/bert-base-uncased_sentiment_sst-2_pfeiffer/) in this case. 
The task prediction head loaded together with the adapter gives us a class label for our sentence: @@ -58,7 +56,7 @@ adapter_name = model.load_adapter('sst-2@ukp', config='pfeiffer') model.set_active_adapters(adapter_name) # predict output tensor -outputs = model(input_tensor) +outputs = model(**input_data) # retrieve the predicted class label predicted = torch.argmax(outputs[0]).item() @@ -74,7 +72,7 @@ model.save_pretrained('./path/to/model/directory/') model.save_adapter('./path/to/adapter/directory/', 'sst-2') # load model -model = BertModel.from_pretrained('./path/to/model/directory/') +model = AutoAdapterModel.from_pretrained('./path/to/model/directory/') model.load_adapter('./path/to/adapter/directory/') ``` diff --git a/tests_adapters/test_adapter_hub.py b/tests_adapters/test_adapter_hub.py index 1235e93b6f..150198df07 100644 --- a/tests_adapters/test_adapter_hub.py +++ b/tests_adapters/test_adapter_hub.py @@ -9,8 +9,8 @@ AdapterConfig, AutoModel, AutoTokenizer, + BertAdapterModel, BertForSequenceClassification, - BertModelWithHeads, GlueDataset, GlueDataTrainingArguments, TrainingArguments, @@ -139,7 +139,7 @@ def test_load_lang_adapter_from_hub(self): self.assertEqual([1, 128, 768], list(output[0].size())) def test_load_adapter_with_head_from_hub(self): - model = BertModelWithHeads.from_pretrained("bert-base-uncased") + model = BertAdapterModel.from_pretrained("bert-base-uncased") loading_info = {} adapter_name = model.load_adapter( diff --git a/tests_adapters/test_adapter_save_id2label.py b/tests_adapters/test_adapter_save_id2label.py index 95b94bf37b..5807b93c2e 100644 --- a/tests_adapters/test_adapter_save_id2label.py +++ b/tests_adapters/test_adapter_save_id2label.py @@ -2,7 +2,7 @@ from tempfile import TemporaryDirectory from typing import Dict -from transformers import BertConfig, BertForSequenceClassification, BertModelWithHeads +from transformers import BertAdapterModel, BertConfig, BertForSequenceClassification def get_default(num_label): @@ -62,7 +62,7 @@ def test_sequ_classification_model_head_labels(self): self.assertDictEqual(self.label_map, model.get_labels_dict()) def test_model_with_heads_tagging_head_labels(self): - model = BertModelWithHeads(self.config) + model = BertAdapterModel(self.config) model.add_tagging_head("test_head", num_labels=len(self.labels), id2label=self.label_map) with TemporaryDirectory() as temp_dir: model.save_head(temp_dir, "test_head") @@ -74,7 +74,7 @@ def test_model_with_heads_tagging_head_labels(self): self.assertDictEqual(self.label_map, model.get_labels_dict()) def test_multiple_heads_label(self): - model = BertModelWithHeads(self.config) + model = BertAdapterModel(self.config) model.add_tagging_head("test_head", num_labels=len(self.labels), id2label=self.label_map) with TemporaryDirectory() as temp_dir: model.save_head(temp_dir, "test_head") @@ -88,7 +88,7 @@ def test_multiple_heads_label(self): self.assertEqual(model.get_labels_dict("classification_head"), default_label_dict) def test_model_with_heads_multiple_heads(self): - model = BertModelWithHeads(self.config) + model = BertAdapterModel(self.config) model.add_tagging_head("test_head", num_labels=len(self.labels), id2label=self.label_map) model.add_classification_head("second_head", num_labels=5) with TemporaryDirectory() as temp_dir: From 31dc4f046c5544a3d161f998c51ba5371f070f3e Mon Sep 17 00:00:00 2001 From: calpt <36051308+calpt@users.noreply.github.com> Date: Mon, 21 Feb 2022 21:28:54 +0100 Subject: [PATCH 09/10] Fix model class conversion --- 
src/transformers/adapters/loading.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/transformers/adapters/loading.py b/src/transformers/adapters/loading.py index de54f92538..2f17ea36b3 100644 --- a/src/transformers/adapters/loading.py +++ b/src/transformers/adapters/loading.py @@ -677,6 +677,16 @@ def load(self, save_directory, load_as=None, loading_info=None, **kwargs): if self.model.__class__.__name__ == config["model_class"]: head_name = load_as or config["name"] head_config = config["config"] + elif config["model_class"].endswith("ModelWithHeads"): + this_class = self.model.__class__.__name__.replace("AdapterModel", "") + other_class = config["model_class"].replace("ModelWithHeads", "") + if this_class == other_class: + head_name = load_as or config["name"] + head_config = config["config"] + else: + raise ValueError( + f"Cannot automatically convert prediction head of model class {config['model_class']} to flex head." + ) # try to convert a static head to a flex head elif self.convert_to_flex_head and config["model_class"] in STATIC_TO_FLEX_HEAD_MAP: head_name = kwargs.pop("main_load_name", load_as) From 9f6db48eca817e189584f36cc9f6dbe11f330a6f Mon Sep 17 00:00:00 2001 From: calpt <36051308+calpt@users.noreply.github.com> Date: Wed, 23 Feb 2022 09:50:59 +0100 Subject: [PATCH 10/10] model_overview doc review --- adapter_docs/model_overview.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/adapter_docs/model_overview.md b/adapter_docs/model_overview.md index dd57df7af6..23ec0673a4 100644 --- a/adapter_docs/model_overview.md +++ b/adapter_docs/model_overview.md @@ -22,7 +22,7 @@ The table below further shows which model architectures support which adaptation | [T5](classes/models/t5.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | [XLM-RoBERTa](classes/models/xlmroberta.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -(*) Depending on the used encoder and decoder model class. +(*) If the used encoder and decoder model class are supported. **Missing a model architecture you'd like to use?** adapter-transformers can be easily extended to new model architectures as described in [Adding Adapters to a Model](https://github.com/Adapter-Hub/adapter-transformers/blob/master/adding_adapters_to_a_model.md).
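One closing note on the `loading.py` change in PATCH 09 above: the legacy-head handling it adds reduces to a simple class-name matching rule between the old `<Model>ModelWithHeads` classes and the new `<Model>AdapterModel` classes. The snippet below distills that rule into a self-contained form; the helper name `legacy_head_is_compatible` is invented here for illustration and is not part of the library:

```python
# Distilled illustration of the check added in src/transformers/adapters/loading.py:
# a head saved by a legacy `<Model>ModelWithHeads` class may only be loaded into the
# matching `<Model>AdapterModel` class.
def legacy_head_is_compatible(current_model_class: str, saved_model_class: str) -> bool:
    if not saved_model_class.endswith("ModelWithHeads"):
        return False
    this_class = current_model_class.replace("AdapterModel", "")
    other_class = saved_model_class.replace("ModelWithHeads", "")
    return this_class == other_class


# A BertAdapterModel can take over a head saved by BertModelWithHeads ...
assert legacy_head_is_compatible("BertAdapterModel", "BertModelWithHeads")
# ... but not one saved by a different architecture (loading.py raises a ValueError in that case).
assert not legacy_head_is_compatible("BertAdapterModel", "RobertaModelWithHeads")
```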