diff --git a/.github/workflows/tests_torch.yml b/.github/workflows/tests_torch.yml index b9483e2e8..c3fba80ea 100644 --- a/.github/workflows/tests_torch.yml +++ b/.github/workflows/tests_torch.yml @@ -60,4 +60,4 @@ jobs: pip install datasets - name: Test run: | - make test-reduced + make test-adapters diff --git a/Makefile b/Makefile index bcebdd804..1f1f3fb9e 100644 --- a/Makefile +++ b/Makefile @@ -82,16 +82,8 @@ test: # Run the adapter tests -test-adapter: - python -m pytest -n auto --dist=loadfile -s -v\ - -k test_adapter\ - --ignore-glob='tests/test_tokenization*'\ - --ignore-glob='tests/test_processor*'\ - ./tests/ - -# Run a reduced test suite in the CI pipeline of adapter-transformers -test-reduced: - python utils/run_tests.py +test-adapters: + python -m pytest -n auto --dist=loadfile -s -v ./tests_adapters/ # Run tests for examples diff --git a/README.md b/README.md index 1127547a2..b918cc3af 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,9 @@ To get started with adapters, refer to these locations: - **https://adapterhub.ml** to explore available pre-trained adapter modules and share your own adapters - **[Examples folder](https://github.com/Adapter-Hub/adapter-transformers/tree/master/examples)** of this repository containing HuggingFace's example training scripts, many adapted for training adapters +## Supported Models + +We currently support the PyTorch versions of all models listed on the **[Model Overview](https://docs.adapterhub.ml/model_overview.html) page** in our documentation. ## Citation diff --git a/adapter_docs/adapter_composition.md b/adapter_docs/adapter_composition.md index edde41dfd..d0c4d7b68 100644 --- a/adapter_docs/adapter_composition.md +++ b/adapter_docs/adapter_composition.md @@ -175,7 +175,7 @@ In the following example, we load two adapters for semantic textual similarity ( We activate a parallel setup where the input is passed through both adapters and their respective prediction heads. ```python -model = AutoModelWithHeads.from_pretrained("distilbert-base-uncased") +model = AutoAdapterModel.from_pretrained("distilbert-base-uncased") tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") adapter1 = model.load_adapter("sts/sts-b@ukp") diff --git a/adapter_docs/classes/models/auto.rst b/adapter_docs/classes/models/auto.rst new file mode 100644 index 000000000..ae7fec236 --- /dev/null +++ b/adapter_docs/classes/models/auto.rst @@ -0,0 +1,11 @@ +Auto Classes +============ + +Similar to the ``AutoModel`` classes built-in into HuggingFace Transformers, adapter-transformers provides an ``AutoAdapterModel`` class. +As with other auto classes, the correct adapter model class is automatically instantiated based on the pre-trained model passed to the ``from_pretrained()`` method. + +AutoAdapterModel +~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.adapters.AutoAdapterModel + :members: diff --git a/adapter_docs/classes/models/bart.rst b/adapter_docs/classes/models/bart.rst index 414989c1c..86630fd3c 100644 --- a/adapter_docs/classes/models/bart.rst +++ b/adapter_docs/classes/models/bart.rst @@ -16,57 +16,10 @@ According to the abstract, state-of-the-art results on a range of abstractive dialogue, question answering, and summarization tasks, with gains of up to 6 ROUGE. -.. note:: - This class is nearly identical to the PyTorch implementation of BART in Huggingface Transformers. - For more information, visit `the corresponding section in their documentation `_. 
- -BartConfig -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BartConfig - :members: - - -BartTokenizer -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BartTokenizer - :members: - - - -BartModel -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BartModel - :members: forward - - -BartModelWithHeads +BartAdapterModel ~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.BartModelWithHeads +.. autoclass:: transformers.adapters.BartAdapterModel :members: :inherited-members: BartPretrainedModel - - -BartForConditionalGeneration -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BartForConditionalGeneration - :members: forward - - -BartForSequenceClassification -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BartForSequenceClassification - :members: forward - - -BartForQuestionAnswering -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BartForQuestionAnswering - :members: forward diff --git a/adapter_docs/classes/models/bert.rst b/adapter_docs/classes/models/bert.rst index 6898c5c2d..e4c7d7d40 100644 --- a/adapter_docs/classes/models/bert.rst +++ b/adapter_docs/classes/models/bert.rst @@ -5,84 +5,10 @@ The BERT model was proposed in `BERT: Pre-training of Deep Bidirectional Transfo by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. It is a bidirectional transformer pre-trained using a combination of masked language modeling objective and next sentence prediction. -.. note:: - This class is nearly identical to the PyTorch implementation of BERT in Huggingface Transformers. - For more information, visit `the corresponding section in their documentation `_. -BertConfig -~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BertConfig - :members: - - -BertTokenizer -~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BertTokenizer - :members: build_inputs_with_special_tokens, get_special_tokens_mask, - create_token_type_ids_from_sequences, save_vocabulary - - -BertModel -~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BertModel - :members: - - -BertModelWithHeads +BertAdapterModel ~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.BertModelWithHeads +.. autoclass:: transformers.adapters.BertAdapterModel :members: :inherited-members: BertPreTrainedModel - - -BertForPreTraining -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BertForPreTraining - :members: - - -BertForMaskedLM -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BertForMaskedLM - :members: - - -BertForNextSentencePrediction -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BertForNextSentencePrediction - :members: - - -BertForSequenceClassification -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BertForSequenceClassification - :members: - - -BertForMultipleChoice -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BertForMultipleChoice - :members: - - -BertForTokenClassification -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
autoclass:: transformers.BertForTokenClassification - :members: - - -BertForQuestionAnswering -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BertForQuestionAnswering - :members: diff --git a/adapter_docs/classes/models/distilbert.rst b/adapter_docs/classes/models/distilbert.rst index 053a827cf..ec12de676 100644 --- a/adapter_docs/classes/models/distilbert.rst +++ b/adapter_docs/classes/models/distilbert.rst @@ -8,63 +8,10 @@ DistilBERT is a small, fast, cheap and light Transformer model trained by distil parameters than `bert-base-uncased`, runs 60% faster while preserving over 95% of Bert's performances as measured on the GLUE language understanding benchmark. -.. note:: - This class is nearly identical to the PyTorch implementation of DistilBERT in Huggingface Transformers. - For more information, visit `the corresponding section in their documentation `_. - -DistilBertConfig -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.DistilBertConfig - :members: - - -DistilBertTokenizer -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.DistilBertTokenizer - :members: - - -DistilBertTokenizerFast -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.DistilBertTokenizerFast - :members: - - -DistilBertModel -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.DistilBertModel - :members: - - -DistilBertModelWithHeads +DistilBertAdapterModel ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.DistilBertModelWithHeads +.. autoclass:: transformers.adapters.DistilBertAdapterModel :members: :inherited-members: DistilBertPreTrainedModel - - -DistilBertForMaskedLM -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.DistilBertForMaskedLM - :members: - - -DistilBertForSequenceClassification -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.DistilBertForSequenceClassification - :members: - - -DistilBertForQuestionAnswering -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.DistilBertForQuestionAnswering - :members: diff --git a/adapter_docs/classes/models/encoderdecoder.rst b/adapter_docs/classes/models/encoderdecoder.rst index 7dd740b01..1e0f78ab0 100644 --- a/adapter_docs/classes/models/encoderdecoder.rst +++ b/adapter_docs/classes/models/encoderdecoder.rst @@ -31,12 +31,6 @@ and decoder for a summarization model as was shown in: `Text Summarization with This class is nearly identical to the PyTorch implementation of DistilBERT in Huggingface Transformers. For more information, visit `the corresponding section in their documentation `_. -EncoderDecoderConfig -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
autoclass:: transformers.EncoderDecoderConfig - :members: - EncoderDecoderModel ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/adapter_docs/classes/models/gpt2.rst b/adapter_docs/classes/models/gpt2.rst index 7c03e7bd0..bb0917dbb 100644 --- a/adapter_docs/classes/models/gpt2.rst +++ b/adapter_docs/classes/models/gpt2.rst @@ -1,9 +1,6 @@ OpenAI GPT2 ----------------------------------------------------------------------------------------------------------------------- -Overview -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - OpenAI GPT-2 model was proposed in `Language Models are Unsupervised Multitask Learners `_ by Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei and Ilya Sutskever. It's a causal (unidirectional) @@ -17,86 +14,10 @@ text. The diversity of the dataset causes this simple goal to contain naturally across diverse domains. GPT-2 is a direct scale-up of GPT, with more than 10X the parameters and trained on more than 10X the amount of data.* -Tips: - -- GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than - the left. -- GPT-2 was trained with a causal language modeling (CLM) objective and is therefore powerful at predicting the next - token in a sequence. Leveraging this feature allows GPT-2 to generate syntactically coherent text as it can be - observed in the `run_generation.py` example script. -- The PyTorch models can take the `past` as input, which is the previously computed key/value attention pairs. Using - this `past` value prevents the model from re-computing pre-computed values in the context of text generation. See - `reusing the past in generative models <../quickstart.html#using-the-past>`__ for more information on the usage of - this argument. - -`Write With Transformer `__ is a webapp created and hosted by -Hugging Face showcasing the generative capabilities of several models. GPT-2 is one of them and is available in five -different sizes: small, medium, large, xl and a distilled version of the small checkpoint: `distilgpt-2`. - -.. note:: - This class is nearly identical to the PyTorch implementation of BERT in Huggingface Transformers. - For more information, visit `the corresponding section in their documentation `_. - -GPT2Config +GPT2AdapterModel ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.GPT2Config - :members: - - -GPT2Tokenizer -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.GPT2Tokenizer - :members: save_vocabulary - - -GPT2TokenizerFast -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.GPT2TokenizerFast - :members: - - -GPT2 specific outputs -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.models.gpt2.modeling_gpt2.GPT2DoubleHeadsModelOutput - :members: - - -GPT2Model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
autoclass:: transformers.GPT2Model - :members: forward - - -GPT2ModelWithHeads -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.GPT2ModelWithHeads +.. autoclass:: transformers.adapters.GPT2AdapterModel :members: :inherited-members: GPT2PreTrainedModel - - -GPT2LMHeadModel -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.GPT2LMHeadModel - :members: forward - - -GPT2DoubleHeadsModel -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.GPT2DoubleHeadsModel - :members: forward - - -GPT2ForSequenceClassification -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.GPT2ForSequenceClassification - :members: forward diff --git a/adapter_docs/classes/models/mbart.rst b/adapter_docs/classes/models/mbart.rst index d2ab6f5c2..bc9106581 100644 --- a/adapter_docs/classes/models/mbart.rst +++ b/adapter_docs/classes/models/mbart.rst @@ -10,83 +10,10 @@ corpora in many languages using the BART objective. mBART is one of the first me sequence-to-sequence model by denoising full texts in multiple languages, while previous approaches have focused only on the encoder, decoder, or reconstructing parts of the text. -.. note:: - This class is nearly identical to the PyTorch implementation of MBart in Huggingface Transformers. - For more information, visit `the corresponding section in their documentation `_. - -MBartConfig -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.MBartConfig - :members: - - -MBartTokenizer -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.MBartTokenizer - :members: as_target_tokenizer, build_inputs_with_special_tokens - - -MBartTokenizerFast -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.MBartTokenizerFast - :members: - - -MBart50Tokenizer -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.MBart50Tokenizer - :members: - - -MBart50TokenizerFast -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.MBart50TokenizerFast - :members: - - -MBartModel -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.MBartModel - :members: - - -MBartModelWithHeads +MBartAdapterModel ~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.MBartModelWithHeads +.. autoclass:: transformers.adapters.MBartAdapterModel :members: :inherited-members: MBartPreTrainedModel - - -MBartForConditionalGeneration -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
autoclass:: transformers.MBartForConditionalGeneration - :members: - - -MBartForQuestionAnswering -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.MBartForQuestionAnswering - :members: - - -MBartForSequenceClassification -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.MBartForSequenceClassification - - -MBartForCausalLM -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.MBartForCausalLM - :members: forward diff --git a/adapter_docs/classes/models/roberta.rst b/adapter_docs/classes/models/roberta.rst index 3026733b6..3e429d369 100644 --- a/adapter_docs/classes/models/roberta.rst +++ b/adapter_docs/classes/models/roberta.rst @@ -5,56 +5,10 @@ The RoBERTa model was proposed in `RoBERTa: A Robustly Optimized BERT Pretrainin by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. It is based on Google's BERT model released in 2018. -.. note:: - This class is nearly identical to the PyTorch implementation of RoBERTa in Huggingface Transformers. - For more information, visit `the corresponding section in their documentation `_. -RobertaConfig -~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.RobertaConfig - :members: - - -RobertaTokenizer -~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.RobertaTokenizer - :members: build_inputs_with_special_tokens, get_special_tokens_mask, - create_token_type_ids_from_sequences, save_vocabulary - - -RobertaModel -~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.RobertaModel - :members: - - -RobertaModelWithHeads +RobertaAdapterModel ~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.RobertaModelWithHeads +.. autoclass:: transformers.adapters.RobertaAdapterModel :members: :inherited-members: RobertaPreTrainedModel - - -RobertaForMaskedLM -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.RobertaForMaskedLM - :members: - - -RobertaForSequenceClassification -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.RobertaForSequenceClassification - :members: - - -RobertaForTokenClassification -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.RobertaForTokenClassification - :members: diff --git a/adapter_docs/classes/models/t5.rst b/adapter_docs/classes/models/t5.rst index 79b79034d..b8be5993b 100644 --- a/adapter_docs/classes/models/t5.rst +++ b/adapter_docs/classes/models/t5.rst @@ -16,54 +16,10 @@ The abstract from the paper is the following, For more information about which prefix to use, it is easiest to look into Appendix D of the `paper `__. -.. note:: - This class is nearly identical to the PyTorch implementation of T5 in Huggingface Transformers. - For more information, visit `the corresponding section in their documentation `_. - -T5Config -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.T5Config - :members: - - -T5Tokenizer -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
autoclass:: transformers.T5Tokenizer - :members: - - -T5TokenizerFast -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.T5TokenizerFast - :members: - - -T5Model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.T5Model - :members: forward - -T5ModelWithHeads +T5AdapterModel ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.T5ModelWithHeads +.. autoclass:: transformers.adapters.T5AdapterModel :members: :inherited-members: T5PreTrainedModel - - - -T5ForConditionalGeneration -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.T5ForConditionalGeneration - :members: forward - -T5EncoderModel -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.T5EncoderModel - :members: forward diff --git a/adapter_docs/classes/models/xlmroberta.rst b/adapter_docs/classes/models/xlmroberta.rst index 6ed7f3d1d..6a0c8c928 100644 --- a/adapter_docs/classes/models/xlmroberta.rst +++ b/adapter_docs/classes/models/xlmroberta.rst @@ -6,62 +6,9 @@ by Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaum Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. It is based on Facebook's RoBERTa model released in 2019. It is a large multi-lingual language model, trained on 2.5TB of filtered CommonCrawl data. -.. note:: - This class is nearly identical to the PyTorch implementation of XLM-RoBERTa in Huggingface Transformers. - For more information, visit `the corresponding section in their documentation `_. -XLMRobertaConfig +XLMRobertaAdapterModel ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.XLMRobertaConfig - :members: - - -XLMRobertaTokenizer -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.XLMRobertaTokenizer - :members: build_inputs_with_special_tokens, get_special_tokens_mask, - create_token_type_ids_from_sequences, save_vocabulary - - -XLMRobertaModel -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.XLMRobertaModel - :members: - - -XLMRobertaModelWithHeads -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.XLMRobertaModelWithHeads - :members: - - -XLMRobertaForMaskedLM -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.XLMRobertaForMaskedLM - :members: - - -XLMRobertaForSequenceClassification -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.XLMRobertaForSequenceClassification - :members: - - -XLMRobertaForMultipleChoice -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.XLMRobertaForMultipleChoice - :members: - - -XLMRobertaForTokenClassification -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.XLMRobertaForTokenClassification +.. 
autoclass:: transformers.adapters.XLMRobertaAdapterModel :members: diff --git a/adapter_docs/classes/weights_loaders.rst b/adapter_docs/classes/weights_loaders.rst deleted file mode 100644 index e7b14bd63..000000000 --- a/adapter_docs/classes/weights_loaders.rst +++ /dev/null @@ -1,38 +0,0 @@ -Weights Loaders -======================= - -These classes perform the extraction, saving and loading of module weights to and from the file system. -All type-specific loader classes inherit from the common ``WeightsLoader`` base class which can also be extended -to add support for additional custom modules. - -These classes provide the basis of adapter module integration into model classes such as adapter saving and loading. -Depending on the model, one of these mixins should be implemented by every adapter-supporting model class. - -WeightsLoader ------------------- - -.. autoclass:: transformers.WeightsLoader - :members: - -AdapterLoader ---------------------------- - -.. autoclass:: transformers.AdapterLoader - :members: - -AdapterFusionLoader ---------------------------- -.. autoclass:: transformers.AdapterFusionLoader - :members: - -PredictionHeadLoader ---------------------------- - -.. autoclass:: transformers.PredictionHeadLoader - :members: - -WeightsLoaderHelper -------------------- - -.. autoclass:: transformers.WeightsLoaderHelper - :members: diff --git a/adapter_docs/conf.py b/adapter_docs/conf.py index 85543b34b..e021e8361 100644 --- a/adapter_docs/conf.py +++ b/adapter_docs/conf.py @@ -42,6 +42,7 @@ "sphinx.ext.napoleon", "sphinx_copybutton", "sphinx_multiversion", + "sphinx_markdown_tables", ] # Add any paths that contain templates here, relative to this directory. diff --git a/adapter_docs/huggingface_hub.md b/adapter_docs/huggingface_hub.md index db0976012..86f37795e 100644 --- a/adapter_docs/huggingface_hub.md +++ b/adapter_docs/huggingface_hub.md @@ -18,9 +18,9 @@ Alternatively, all adapters on the HuggingFace Model Hub are also listed on [htt After you have found an adapter you would like to use, loading it into a Transformer model is very similar to [loading adapters from AdapterHub](loading.md). For example, for loading and activating the adapter [`AdapterHub/roberta-base-pf-sick`](https://huggingface.co/AdapterHub/roberta-base-pf-sick), write: ```python -from transformers import AutoModelWithHeads +from transformers import AutoAdapterModel -model = AutoModelWithHeads.from_pretrained("roberta-base") +model = AutoAdapterModel.from_pretrained("roberta-base") adapter_name = model.load_adapter("AdapterHub/roberta-base-pf-sick", source="hf") model.active_adapters = adapter_name ``` diff --git a/adapter_docs/index.rst b/adapter_docs/index.rst index c337fb759..46c1976e6 100644 --- a/adapter_docs/index.rst +++ b/adapter_docs/index.rst @@ -19,7 +19,7 @@ The *adapter-transformers* section documents the integration of adapters into th The section on *Adapter-Hub* describes the fundamentals of the pre-trained adapter repository and how to contribute new adapters. -Currently, we support the PyTorch versions of all models listed in the *Supported Models* section. +Currently, we support the PyTorch versions of all models as listed on the `Model Overview `_ page. .. toctree:: :maxdepth: 2 @@ -43,22 +43,12 @@ Currently, we support the PyTorch versions of all models listed in the *Supporte contributing huggingface_hub -.. 
toctree:: - :maxdepth: 2 - :caption: Adapter-Related Classes - - classes/adapter_config - classes/model_adapters_config - classes/adapter_modules - classes/adapter_layer - classes/model_mixins - classes/adapter_utils - classes/weights_loaders - .. toctree:: :maxdepth: 1 :caption: Supported Models + model_overview + classes/models/auto classes/models/bart classes/models/bert classes/models/distilbert @@ -69,6 +59,17 @@ Currently, we support the PyTorch versions of all models listed in the *Supporte classes/models/t5 classes/models/xlmroberta +.. toctree:: + :maxdepth: 2 + :caption: Adapter-Related Classes + + classes/adapter_config + classes/model_adapters_config + classes/adapter_modules + classes/adapter_layer + classes/model_mixins + classes/adapter_utils + Citation ======== diff --git a/adapter_docs/loading.md b/adapter_docs/loading.md index 8812962c9..a93594729 100644 --- a/adapter_docs/loading.md +++ b/adapter_docs/loading.md @@ -55,13 +55,13 @@ model.set_active_adapters(adapter_name) As the second example, let's have a look at how to load an adapter based on the [`AdapterInfo`](classes/adapter_utils.html#transformers.adapters.utils.AdapterInfo) returned by the [`list_adapters()`](classes/adapter_utils.html#transformers.adapters.utils.list_adapters) method from [above](#finding-pre-trained-adapters): ```python -from transformers import AutoModelWithHeads, list_available_adapters +from transformers import AutoAdapterModel, list_available_adapters adapter_infos = list_available_adapters(source="ah") # Take the first adapter info as an example adapter_info = adapter_infos[0] -model = AutoModelWithHeads.from_pretrained(adapter_info.model_name) +model = AutoAdapterModel.from_pretrained(adapter_info.model_name) model.load_adapter(adapter_info.adapter_id, source=adapter_info.source) ``` diff --git a/adapter_docs/model_overview.md b/adapter_docs/model_overview.md new file mode 100644 index 000000000..23ec0673a --- /dev/null +++ b/adapter_docs/model_overview.md @@ -0,0 +1,30 @@ +# Model Overview + +This page gives an overview of the Transformer models currently supported by `adapter-transformers`. +The table below further shows which model architectures support which adaptation methods and which features of `adapter-transformers`. + +```eval_rst +.. note:: + Each supported model architecture X typically provides a class ``XAdapterModel`` for usage with ``AutoAdapterModel``. + Additionally, it is possible to use adapters with the model classes already shipped with HuggingFace Transformers. + E.g., for BERT, this means adapter-transformers provides a ``BertAdapterModel`` class, but you can also use ``BertModel``, ``BertForSequenceClassification`` etc. together with adapters. +``` + +| Model | (Bottleneck)
Adapters | Prefix Tuning | Compacter | Adapter Fusion | Invertible Adapters | Parallel
block | +| --------------------------------------- | -| - | - | - | - | - | +| [BART](classes/models/bart.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [BERT](classes/models/bert.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [DistilBERT](classes/models/distilbert.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [Encoder Decoder](classes/models/encoderdecoder.html) | (*) | (*) | (*) | (*) | (*) | | +| [GPT-2](classes/models/gpt2.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [MBart](classes/models/mbart.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [RoBERTa](classes/models/roberta.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [T5](classes/models/t5.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [XLM-RoBERTa](classes/models/xlmroberta.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | + +(*) If the used encoder and decoder model class are supported. + +**Missing a model architecture you'd like to use?** +adapter-transformers can be easily extended to new model architectures as described in [Adding Adapters to a Model](https://github.com/Adapter-Hub/adapter-transformers/blob/master/adding_adapters_to_a_model.md). +Feel free to [open an issue](https://github.com/Adapter-Hub/adapter-transformers/issues) requesting support for a new architecture. +_We very much welcome pull requests adding new model implementations!_ diff --git a/adapter_docs/prediction_heads.md b/adapter_docs/prediction_heads.md index 2c7ab0d7b..98d955ba6 100644 --- a/adapter_docs/prediction_heads.md +++ b/adapter_docs/prediction_heads.md @@ -1,38 +1,31 @@ # Prediction Heads This section gives an overview how different prediction heads can be used together with adapter modules and how pre-trained adapters can be distributed side-by-side with matching prediction heads in AdapterHub. -We will take a look at our own new **model classes with flexible heads** (e.g. `BertModelWithHeads`) as well as **models with static heads** provided out-of-the-box by HuggingFace (e.g. `BertForSequenceClassification`). +We will take a look at the `AdapterModel` classes (e.g. `BertAdapterModel`) introduced by adapter-transformers, which provide **flexible** support for prediction heads, as well as models with **static** heads provided out-of-the-box by HuggingFace Transformers (e.g. `BertForSequenceClassification`). ```eval_rst .. tip:: - We recommend to use the `model classes with flexible heads <#models-with-flexible-heads>`_ whenever possible. + We recommend to use the `AdapterModel classes <#adaptermodel-classes>`_ whenever possible. They have been created specifically for working with adapters and provide more flexibility. ``` -```eval_rst -.. important:: - Although the two prediction head implementations serve the same use case, their weights are *not* directly compatible, i.e. you cannot load a head created with ``AutoModelWithHeds`` into a model of type ``AutoModelForSequenceClassification``. - There is however an `automatic conversion to model classes with flexible heads <#automatic-conversion>`_. -``` +## AdapterModel classes -## Models with flexible heads +The AdapterModel classes provided by `adapter-transformers` allow a flexible configuration of prediction heads on top of a pre-trained language model. -To allow for prediction heads to be configured in a flexible way on top of a pre-trained language model, `adapter-transformers` provides a new line of model classes. -These classes follow the naming schema `WithHeads` and are available for all model classes supporting adapters. 
Let's see how they work: - -First, we load pre-trained model from HuggingFace: +First, we load pre-trained model from the HuggingFace Hub via the [`AutoAdapterModel`](classes/models/auto.html#transformers.adapters.AutoAdapterModel) class: ```python -model = BertModelWithHeads.from_pretrained("bert-base-uncased") +model = AutoAdapterModel.from_pretrained("bert-base-uncased") ``` -Although we use the class `BertModelWithHeads`, this model doesn't have any heads yet. We add a new one in the next step: +By default, this model doesn't have any heads yet. We add a new one in the next step: ```python model.add_classification_head("mrpc", num_labels=2) ``` The line above adds a binary sequence classification head on top of our model. As this head is named, we could add multiple other heads with different names to the same model. This is especially useful if used together with matching adapter modules. -For more about the different head types and the configuration options, refer to the class references of the respective model classes, e.g. [BertModelWithHeads](classes/models/bert.html#transformers.BertModelWithHeads). +To learn more about the different head types and the configuration options, please refer to the class references of the respective model classes, e.g. [`BertAdapterModel`](classes/models/bert.html#transformers.adapters.BertAdapterModel). Now, of course, we would like to train our classification head together with an adapter, so let's add one: ```python @@ -49,9 +42,10 @@ At this point, we can start to [train our setup](training.md). The ``set_active_adapters()`` will search for an adapter and a prediction head with the given name to be activated. Alternatively, prediction heads can also be activated explicitly (i.e. without adapter modules). These three options are possible (in order of priority when multiple are specified): - 1. If ``head`` is passed to the forward call, the head with the given name is used. - 2. If the forward call is executed within an ``AdapterSetup`` context, the head configuration is read from the context. - 3. If the ``active_head`` property is set, the head configuration is read from there. + + 1. If ``head`` is passed to the forward call, the head with the given name is used. + 2. If the forward call is executed within an ``AdapterSetup`` context, the head configuration is read from the context. + 3. If the ``active_head`` property is set, the head configuration is read from there. ``` After training has completed, we can save our whole setup (adapter module _and_ prediction head), with a single call: @@ -59,8 +53,8 @@ After training has completed, we can save our whole setup (adapter module _and_ model.save_adapter("/path/to/dir", "mrpc", with_head=True) ``` -Now, we just have to [share our work with the world](contributing.html#add-your-pre-trained-adapter). -After we published our adapter together with its head in the Hub, anyone else can load both adapter and head by using the same model class. +Now, you just have to [share your work with the world](contributing.html#add-your-pre-trained-adapter). +After you published our adapter together with its head in the Hub, anyone else can load both adapter and head by using the same model class. 
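As a minimal sketch of that loading step (the directory path is illustrative and assumes the setup was saved with `with_head=True` as above), the whole setup can be restored into a fresh model:

```python
from transformers import AutoAdapterModel

# Load the same base model the adapter was trained on.
model = AutoAdapterModel.from_pretrained("bert-base-uncased")

# Loading the saved directory restores both the adapter module and its
# prediction head; set_active=True also activates them for the forward pass.
adapter_name = model.load_adapter("/path/to/dir", set_active=True)
```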
Alternatively, we can also save and load the prediction head separately from an adapter module: @@ -77,7 +71,7 @@ Lastly, it's also possible to delete an added head again: model.delete_head("mrpc") ``` -## Model with static heads (HuggingFace heads) +## Model classes with static heads (HuggingFace Transformers) The `transformers` library provides strongly typed model classes with heads for various different tasks (e.g. `RobertaForSequenceClassification`, `AutoModelForMultipleChoice` ...). If an adapter module is trained with one these out-of-the-box classes, it is encouraged to also distribute the prediction head weights together with the adapter weights. @@ -93,14 +87,20 @@ In case the classes match, our prediction head weights will be automatically loa ## Automatic conversion -Beginning with v2.1 of `adapter-transformers`, it is possible to load static heads, e.g. created with `AutoModelForSequenceClassification`, into model classes with flexible heads, e.g. `AutoModelWithHeads`. +```eval_rst +.. important:: + Although the two prediction head implementations serve the same use case, their weights are *not* directly compatible, i.e. you cannot load a head created with ``AutoAdapterModel`` into a model of type ``AutoModelForSequenceClassification``. + There is however an automatic conversion to model classes with flexible heads. +``` + +Beginning with v2.1 of `adapter-transformers`, it is possible to load static heads, e.g. created with `AutoModelForSequenceClassification`, into model classes with flexible heads, e.g. `AutoAdapterModel`. The conversion of weights happens automatically during the call of `load_adapter()`, so no additional steps are needed: ```python static_head_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased") static_head_model.add_adapter("test") static_head_model.save_adapter(temp_dir, "test") -flex_head_model = AutoModelWithHeads.from_pretrained("bert-base-uncased") +flex_head_model = AutoAdapterModel.from_pretrained("bert-base-uncased") flex_head_model.load_adapter(temp_dir) assert "test" in flex_head_model.config.adapters diff --git a/adapter_docs/quickstart.md b/adapter_docs/quickstart.md index 6a10aa435..a0a7b74c7 100644 --- a/adapter_docs/quickstart.md +++ b/adapter_docs/quickstart.md @@ -21,31 +21,29 @@ The following example shows the usage of a basic pre-trained transformer model w Our goal here is to predict the sentiment of a given sentence. We use BERT in this example, so we first load a pre-trained `BertTokenizer` to encode the input sentence and a pre-trained -`BertModel` from HuggingFace: +`bert-base-uncased` checkpoint from HuggingFace's Model Hub using the [`BertAdapterModel`](classes/models/bert.html#transformers.adapters.BertAdapterModel) class: ```python import torch -from transformers import BertTokenizer, BertForSequenceClassification +from transformers import BertTokenizer +from transformers.adapters import BertAdapterModel -# output more information -import logging -logging.basicConfig(level=logging.INFO) - -# load pre-trained BERT tokenizer from Huggingface +# Load pre-trained BERT tokenizer from Huggingface. tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') -# tokenize an input sentence +# An input sentence. sentence = "It's also, clearly, great fun." -# convert input tokens to indices and create PyTorch input tensor -input_tensor = torch.tensor([tokenizer.encode(sentence)]) +# Tokenize the input sentence and create a PyTorch input tensor. 
+input_data = tokenizer(sentence, return_tensors='pytorch') -# load pre-trained BERT model from Huggingface -# the `BertForSequenceClassification` class includes a prediction head for sequence classification -model = BertForSequenceClassification.from_pretrained('bert-base-uncased') +# Load pre-trained BERT model from HuggingFace Hub. +# The `BertAdapterModel` class is specifically designed for working with adapters. +# It can be used with different prediction heads. +model = BertAdapterModel.from_pretrained('bert-base-uncased') ``` -Having loaded the model, we now add a pre-trained task adapter that is useful to our task from Adapter Hub. +Having loaded the model, we now add a pre-trained task adapter that is useful to our task from AdapterHub. As we're doing sentiment classification, we use [an adapter trained on the SST-2 dataset](https://adapterhub.ml/adapters/ukp/bert-base-uncased_sentiment_sst-2_pfeiffer/) in this case. The task prediction head loaded together with the adapter gives us a class label for our sentence: @@ -58,7 +56,7 @@ adapter_name = model.load_adapter('sst-2@ukp', config='pfeiffer') model.set_active_adapters(adapter_name) # predict output tensor -outputs = model(input_tensor) +outputs = model(**input_data) # retrieve the predicted class label predicted = torch.argmax(outputs[0]).item() @@ -74,7 +72,7 @@ model.save_pretrained('./path/to/model/directory/') model.save_adapter('./path/to/adapter/directory/', 'sst-2') # load model -model = BertModel.from_pretrained('./path/to/model/directory/') +model = AutoAdapterModel.from_pretrained('./path/to/model/directory/') model.load_adapter('./path/to/adapter/directory/') ``` diff --git a/adapter_docs/training.md b/adapter_docs/training.md index f94a7fcca..41f45e451 100644 --- a/adapter_docs/training.md +++ b/adapter_docs/training.md @@ -16,11 +16,11 @@ pip install -r ./examples//requirements.txt Training a task adapter module on a dataset only requires minor modifications from training the full model. Suppose we have an existing script for training a Transformer model, here we will use HuggingFace's [run_glue.py](https://github.com/Adapter-Hub/adapter-transformers/blob/master/examples/text-classification/run_glue.py) example script for training on the GLUE dataset. -In our example, we replaced the built-in `AutoModelForSequenceClassification` class with the `AutoModelWithHeads` class introduced by `adapter-transformers` (learn more about prediction heads [here](prediction_heads.md)). +In our example, we replaced the built-in `AutoModelForSequenceClassification` class with the `AutoAdapterModel` class introduced by `adapter-transformers` (learn more about prediction heads [here](prediction_heads.md)). Therefore, the model instantiation changed to: ```python -model = AutoModelWithHeads.from_pretrained( +model = AutoAdapterModel.from_pretrained( model_args.model_name_or_path, config=config, ) diff --git a/adding_adapters_to_a_model.md b/adding_adapters_to_a_model.md index a9f35d26f..a2367ca0f 100644 --- a/adding_adapters_to_a_model.md +++ b/adding_adapters_to_a_model.md @@ -14,31 +14,39 @@ Now we go through the integration of adapters into an existing model architectur ## Implementation -❓ Each model architecture with adapter support has a main `.py` module in `src/transformers/adapters/models` (e.g. `src/transformers/adapters/models/distilbert.py` for `modeling_distilbert.py`) that provides the required adapter mixins for each modeling component (e.g. 
there is a `DistilBertTransfomerBlockAdaptersMixin` for the `TransformerBlock` of DistilBERT etc.). -This is the central module to implement. +### Integration into model implementation + +❓ Adding adapter support to an existing model architecture requires modifying a few parts of the model forward pass logic. These changes have to be made directly in the respective `modeling_.py` class. +Additionally, a few adapter mixins need to be applied to the respective Transformer module classes to provide the adapter implementations to a model. +For this purpose, there typically exists a module `src/transformers/adapters/mixins/.py`. **📝 Steps** -- Add a new `.py` module for your architecture in `src/transformers/adapters/models` (or reuse an existing if possible). - - There usually should be one mixin that derives from `AdapterLayerBaseMixin` or has it as a child module. - - The mixin for the whole base model class (e.g. `BertModel`) should derive from `ModelAdaptersMixin` and (if possible) `InvertibleAdaptersMixin`. Make sure to implement the abstract methods these mixins might define. +- Add a new `.py` module for your architecture in `src/transformers/adapters/mixins` (or reuse an existing if possible). + - There usually exists a mixin on the Transformer layer level that derives that holds modules for adapter layers. + - The mixin for the whole base model class (e.g. `BertModel`) should derive from `ModelAdaptersMixin` and (if possible) `InvertibleAdaptersMixin`. This mixin should at least implement the `iter_layers()` method but might require additional modifications depending on the architecture. - Have a look at existing examples, e.g. `distilbert.py`, `bert.py`. -- Implement the mixins on the modeling classes (`modeling_.py`). - - Make sure the calls to `adapters_forward()` are added in the right places. +- Implement the mixins and the required modifications on the modeling classes (`modeling_.py`). + - Make sure the calls to `adapter_layer_forward()` are added in the right places. - The base model class (e.g. `BertModel`) should implement the mixin derived from `ModelAdaptersMixin` you created previously. - The model classes with heads (e.g. `BertForSequenceClassification`) should directly implement `ModelWithHeadsAdaptersMixin`. + - To additionally support Prefix Tuning, it's necessary to apply the forward call to the `PrefixTuningShim` module in the respective attention layer. + - Again, have a look at existing implementations, e.g. `modeling_distilbert.py` or `modeling_bart.py`. - Add the mixin for config classes, `ModelConfigAdaptersMixin`, to the model configuration class in `configuration_`. - There are some naming differences on the config attributes of different model architectures. The adapter implementation requires some additional attributes with a specific name to be available. These currently are `hidden_dropout_prob` and `attention_probs_dropout_prob` as in the `BertConfig` class. -❓ Adapter-supporting architectures have a new model class `ModelWithHeads`. -These classes allow flexible adding of and switching between multiple prediction heads of different types. +### `...AdapterModel` class + +❓ Adapter-supporting architectures should provide a new model class `AdapterModel`. +This class allows flexible adding of and switching between multiple prediction heads of different types. **📝 Steps** -- In `modeling_.py`, add a new `ModelWithHeads` class. 
- - This class should implement a mixin (in `src/transformers/adapters/models/.py`) which derives from `ModelWithFlexibleHeadsAdaptersMixin` - - In the mixin, add methods for those prediction heads that make sense for the new model architecture. -- Add `ModelWithHeads` to the `MODEL_WITH_HEADS_MAPPING` mapping in `modeling_auto.py` and to `__init__.py`. +- In `src/transformers/adapters/models`, add a new `.py` file. + - This module should implement the `AdapterModel` class, deriving from `ModelWithFlexibleHeadsAdaptersMixin` and `PreTrainedModel`. + - In the model class, add methods for those prediction heads that make sense for the new model architecture. + - Again, have a look at existing implementations, e.g. `bert.py`. Note that the `ModelWithHeads` classes in existing modules are kept for backwards compatibility and are not needed for newly added architectures. +- Add `AdapterModel` to the `ADAPTER_MODEL_MAPPING_NAMES` mapping in `src/transformers/adapters/models/auto.py` and to `src/transformers/adapters/__init__.py`. ### Additional (optional) implementation steps @@ -47,13 +55,14 @@ These classes allow flexible adding of and switching between multiple prediction ## Testing -❓ In addition to the general HuggingFace model tests, there are adapter-specific test cases (usually starting with `test_adapter_`). +❓ In addition to the general HuggingFace model tests, there are adapter-specific test cases. All tests are executed from the `tests_adapters` folder. **📝 Steps** -- Add a new `AdapterTest` class in `test_adapter.py` similar to the existing classes (e.g. `BertAdapterTest`). -- Add `ModelWithHeads` to `test_modeling_.py`. -- Insert `test_modeling_` into the list of tested modules in `utils/run_tests.py`. +- Add a new `test_.py` module in `tests_adapters`. This module typically holds three test classes: + - `AdapterModelTest` derives directly from HuggingFace's existing model test class `ModelTest` and adds `AdapterModel` as class to test. + - `AdapterModelTest` derives from a collection of test mixins that hold various adapter tests (depending on the implementation). + - (optionally) `ClassConversionTest` runs tests for correct class conversion if conversion of prediction heads is implemented. - Append `` to the list in `check_adapters.py`. ## Documentation @@ -62,7 +71,7 @@ These classes allow flexible adding of and switching between multiple prediction **📝 Steps** -- Add `adapter_docs/classes/models/.rst` (oriented at the doc file in the HF docs, make sure to include `ModelWithHeads` and the HF notice). +- Add `adapter_docs/classes/models/.rst` (oriented at the doc file in the HF docs). Make sure to include `AdapterModel` autodoc. Finally, list the file in `index.rst`. 
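To make the `<model>AdapterModel` steps above more concrete, here is a schematic sketch for a hypothetical architecture `MyModel`. All `MyModel*` names are placeholders, and the helper calls mirror what the existing implementations (e.g. `bert.py`) use, so verify them against the current mixin code rather than treating this as a drop-in file:

```python
from transformers.adapters import ClassificationHead, ModelWithFlexibleHeadsAdaptersMixin

# Placeholders: the new architecture's existing base classes.
from transformers import MyModelModel, MyModelPreTrainedModel


class MyModelAdapterModel(ModelWithFlexibleHeadsAdaptersMixin, MyModelPreTrainedModel):
    """Flex-head model class for the hypothetical MyModel architecture."""

    def __init__(self, config):
        super().__init__(config)
        self.mymodel = MyModelModel(config)  # plain base model
        self._init_head_modules()            # set up the flexible prediction head container
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, head=None, **kwargs):
        outputs = self.mymodel(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
        # Route the base model output through the active (or explicitly requested) head.
        return self.forward_head(outputs, head_name=head, attention_mask=attention_mask)

    # Only add head types that make sense for the new architecture.
    def add_classification_head(self, head_name, num_labels=2, overwrite_ok=False, **kwargs):
        head = ClassificationHead(self, head_name, num_labels=num_labels, **kwargs)
        self.add_prediction_head(head, overwrite_ok=overwrite_ok)


# Finally, register the class in src/transformers/adapters/models/auto.py, e.g.:
# ADAPTER_MODEL_MAPPING_NAMES["mymodel"] = "MyModelAdapterModel"
```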
## Training Example Adapters diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index ee3a1dcf1..e92509dc5 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -598,7 +598,6 @@ "MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING", "MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING", "MODEL_MAPPING", - "MODEL_WITH_HEADS_MAPPING", "MODEL_WITH_LM_HEAD_MAPPING", "AutoModel", "AutoModelForAudioClassification", @@ -617,7 +616,6 @@ "AutoModelForSpeechSeq2Seq", "AutoModelForTableQuestionAnswering", "AutoModelForTokenClassification", - "AutoModelWithHeads", "AutoModelWithLMHead", ] ) @@ -629,7 +627,6 @@ "BartForQuestionAnswering", "BartForSequenceClassification", "BartModel", - "BartModelWithHeads", "BartPretrainedModel", "PretrainedBartModel", ] @@ -656,7 +653,6 @@ "BertLayer", "BertLMHeadModel", "BertModel", - "BertModelWithHeads", "BertPreTrainedModel", "load_tf_weights_in_bert", ] @@ -811,7 +807,6 @@ "DistilBertForSequenceClassification", "DistilBertForTokenClassification", "DistilBertModel", - "DistilBertModelWithHeads", "DistilBertPreTrainedModel", ] ) @@ -895,7 +890,6 @@ "GPT2ForTokenClassification", "GPT2LMHeadModel", "GPT2Model", - "GPT2ModelWithHeads", "GPT2PreTrainedModel", "load_tf_weights_in_gpt2", ] @@ -1020,7 +1014,6 @@ "MBartForQuestionAnswering", "MBartForSequenceClassification", "MBartModel", - "MBartModelWithHeads", "MBartPreTrainedModel", ] ) @@ -1139,7 +1132,6 @@ "RobertaForSequenceClassification", "RobertaForTokenClassification", "RobertaModel", - "RobertaModelWithHeads", "RobertaPreTrainedModel", ] ) @@ -1225,7 +1217,6 @@ "T5EncoderModel", "T5ForConditionalGeneration", "T5Model", - "T5ModelWithHeads", "T5PreTrainedModel", "load_tf_weights_in_t5", ] @@ -1330,7 +1321,6 @@ "XLMRobertaForSequenceClassification", "XLMRobertaForTokenClassification", "XLMRobertaModel", - "XLMRobertaModelWithHeads", ] ) _import_structure["models.xlnet"].extend( @@ -1368,57 +1358,58 @@ # Adapters if is_torch_available(): - _import_structure["adapters.configuration"] = [ + _import_structure["adapters"] = [ + "ADAPTER_CACHE", "ADAPTER_CONFIG_MAP", "ADAPTERFUSION_CONFIG_MAP", + "ADAPTER_MODEL_MAPPING", "DEFAULT_ADAPTER_CONFIG", "DEFAULT_ADAPTERFUSION_CONFIG", + "MODEL_WITH_HEADS_MAPPING", + "AdapterArguments", "AdapterConfig", "AdapterFusionConfig", + "AdapterInfo", + "AdapterLayer", + "AdapterSetup", + "AdapterTrainer", + "AdapterType", + "AutoAdapterModel", + "AutoModelWithHeads", + "BartAdapterModel", + "BartModelWithHeads", + "BertAdapterModel", + "BertModelWithHeads", + "DistilBertAdapterModel", + "DistilBertModelWithHeads", "DynamicAdapterFusionConfig", + "ForwardContext", + "GPT2AdapterModel", + "GPT2ModelWithHeads", "HoulsbyConfig", "HoulsbyInvConfig", - "ModelAdaptersConfig", - "PfeifferConfig", - "PfeifferInvConfig", - "StaticAdapterFusionConfig", - ] - _import_structure["adapters.context"] = ["AdapterSetup"] - _import_structure["adapters.heads"] = ["ModelWithFlexibleHeadsAdaptersMixin"] - _import_structure["adapters.layer"] = ["AdapterLayer"] - _import_structure["adapters.loading"] = [ - "AdapterFusionLoader", - "AdapterLoader", - "PredictionHeadLoader", - "WeightsLoader", - "WeightsLoaderHelper", - ] - _import_structure["adapters.model_mixin"] = [ "InvertibleAdaptersMixin", + "MBartAdapterModel", + "MBartModelWithHeads", + "ModelAdaptersConfig", "ModelAdaptersMixin", "ModelConfigAdaptersMixin", + "ModelWithFlexibleHeadsAdaptersMixin", "ModelWithHeadsAdaptersMixin", - ] - _import_structure["adapters.trainer"] = [ - "AdapterTrainer", - "Seq2SeqAdapterTrainer", - ] 
- _import_structure["adapters.training"] = [ - "AdapterArguments", "MultiLingAdapterArguments", - ] - _import_structure["adapters.utils"] = [ - "ADAPTER_CACHE", - "ADAPTER_HUB_INDEX_FILE", - "ADAPTER_HUB_URL", - "AdapterInfo", - "AdapterType", + "PfeifferConfig", + "PfeifferInvConfig", + "RobertaAdapterModel", + "RobertaModelWithHeads", + "Seq2SeqAdapterTrainer", + "StaticAdapterFusionConfig", + "T5AdapterModel", + "T5ModelWithHeads", + "XLMRobertaAdapterModel", + "XLMRobertaModelWithHeads", "get_adapter_config_hash", "get_adapter_info", "list_adapters", - "pull_from_hub", - "resolve_adapter_config", - "resolve_adapter_path", ] # TensorFlow-backed objects @@ -2509,7 +2500,6 @@ MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, MODEL_MAPPING, - MODEL_WITH_HEADS_MAPPING, MODEL_WITH_LM_HEAD_MAPPING, AutoModel, AutoModelForAudioClassification, @@ -2528,7 +2518,6 @@ AutoModelForSpeechSeq2Seq, AutoModelForTableQuestionAnswering, AutoModelForTokenClassification, - AutoModelWithHeads, AutoModelWithLMHead, ) from .models.bart import ( @@ -2538,7 +2527,6 @@ BartForQuestionAnswering, BartForSequenceClassification, BartModel, - BartModelWithHeads, BartPretrainedModel, PretrainedBartModel, ) @@ -2561,7 +2549,6 @@ BertLayer, BertLMHeadModel, BertModel, - BertModelWithHeads, BertPreTrainedModel, load_tf_weights_in_bert, ) @@ -2688,7 +2675,6 @@ DistilBertForSequenceClassification, DistilBertForTokenClassification, DistilBertModel, - DistilBertModelWithHeads, DistilBertPreTrainedModel, ) from .models.dpr import ( @@ -2760,7 +2746,6 @@ GPT2ForTokenClassification, GPT2LMHeadModel, GPT2Model, - GPT2ModelWithHeads, GPT2PreTrainedModel, load_tf_weights_in_gpt2, ) @@ -2861,7 +2846,6 @@ MBartForQuestionAnswering, MBartForSequenceClassification, MBartModel, - MBartModelWithHeads, MBartPreTrainedModel, ) from .models.megatron_bert import ( @@ -2963,7 +2947,6 @@ RobertaForSequenceClassification, RobertaForTokenClassification, RobertaModel, - RobertaModelWithHeads, RobertaPreTrainedModel, ) from .models.roformer import ( @@ -3033,7 +3016,6 @@ T5EncoderModel, T5ForConditionalGeneration, T5Model, - T5ModelWithHeads, T5PreTrainedModel, load_tf_weights_in_t5, ) @@ -3118,7 +3100,6 @@ XLMRobertaForSequenceClassification, XLMRobertaForTokenClassification, XLMRobertaModel, - XLMRobertaModelWithHeads, ) from .models.xlnet import ( XLNET_PRETRAINED_MODEL_ARCHIVE_LIST, @@ -3155,51 +3136,58 @@ # Adapters if is_torch_available(): - from .adapters.config import ( + from .adapters import ( + ADAPTER_CACHE, ADAPTER_CONFIG_MAP, + ADAPTER_MODEL_MAPPING, ADAPTERFUSION_CONFIG_MAP, DEFAULT_ADAPTER_CONFIG, DEFAULT_ADAPTERFUSION_CONFIG, + MODEL_WITH_HEADS_MAPPING, + AdapterArguments, AdapterConfig, AdapterFusionConfig, + AdapterInfo, + AdapterLayer, + AdapterSetup, + AdapterTrainer, + AdapterType, + AutoAdapterModel, + AutoModelWithHeads, + BartAdapterModel, + BartModelWithHeads, + BertAdapterModel, + BertModelWithHeads, + DistilBertAdapterModel, + DistilBertModelWithHeads, DynamicAdapterFusionConfig, + ForwardContext, + GPT2AdapterModel, + GPT2ModelWithHeads, HoulsbyConfig, HoulsbyInvConfig, - ModelAdaptersConfig, - PfeifferConfig, - PfeifferInvConfig, - StaticAdapterFusionConfig, - ) - from .adapters.context import AdapterSetup - from .adapters.heads import ModelWithFlexibleHeadsAdaptersMixin - from .adapters.layer import AdapterLayer - from .adapters.loading import ( - AdapterFusionLoader, - AdapterLoader, - PredictionHeadLoader, - WeightsLoader, - WeightsLoaderHelper, - ) - from .adapters.model_mixin 
import ( InvertibleAdaptersMixin, + MBartAdapterModel, + MBartModelWithHeads, + ModelAdaptersConfig, ModelAdaptersMixin, ModelConfigAdaptersMixin, + ModelWithFlexibleHeadsAdaptersMixin, ModelWithHeadsAdaptersMixin, - ) - from .adapters.trainer import AdapterTrainer, Seq2SeqAdapterTrainer - from .adapters.training import AdapterArguments, MultiLingAdapterArguments - from .adapters.utils import ( - ADAPTER_CACHE, - ADAPTER_HUB_INDEX_FILE, - ADAPTER_HUB_URL, - AdapterInfo, - AdapterType, + MultiLingAdapterArguments, + PfeifferConfig, + PfeifferInvConfig, + RobertaAdapterModel, + RobertaModelWithHeads, + Seq2SeqAdapterTrainer, + StaticAdapterFusionConfig, + T5AdapterModel, + T5ModelWithHeads, + XLMRobertaAdapterModel, + XLMRobertaModelWithHeads, get_adapter_config_hash, get_adapter_info, list_adapters, - pull_from_hub, - resolve_adapter_config, - resolve_adapter_path, ) # TensorFlow @@ -3711,6 +3699,7 @@ extra_objects={"__version__": __version__, "__adapters_version__": __adapters_version__}, ) + if not is_tf_available() and not is_torch_available() and not is_flax_available(): logger.warning( "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. " diff --git a/src/transformers/adapters/__init__.py b/src/transformers/adapters/__init__.py index e69de29bb..fe76f34f8 100644 --- a/src/transformers/adapters/__init__.py +++ b/src/transformers/adapters/__init__.py @@ -0,0 +1,202 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2020 The Adapter-Hub Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import TYPE_CHECKING + +from ..file_utils import _LazyModule + + +_import_structure = { + "composition": [ + "AdapterCompositionBlock", + "BatchSplit", + "Fuse", + "Parallel", + "Split", + "Stack", + "parse_composition", + "validate_composition", + ], + "configuration": [ + "ADAPTER_CONFIG_MAP", + "ADAPTERFUSION_CONFIG_MAP", + "DEFAULT_ADAPTER_CONFIG", + "DEFAULT_ADAPTERFUSION_CONFIG", + "AdapterConfig", + "AdapterFusionConfig", + "DynamicAdapterFusionConfig", + "HoulsbyConfig", + "HoulsbyInvConfig", + "ModelAdaptersConfig", + "PfeifferConfig", + "PfeifferInvConfig", + "StaticAdapterFusionConfig", + ], + "context": [ + "AdapterSetup", + "ForwardContext", + ], + "heads": [ + "BertStyleMaskedLMHead", + "BiaffineParsingHead", + "CausalLMHead", + "ClassificationHead", + "DependencyParsingOutput", + "ModelWithFlexibleHeadsAdaptersMixin", + "MultiHeadOutput", + "MultiLabelClassificationHead", + "MultipleChoiceHead", + "PredictionHead", + "QuestionAnsweringHead", + "Seq2SeqLMHead", + "TaggingHead", + ], + "layer": ["AdapterLayer"], + "model_mixin": [ + "InvertibleAdaptersMixin", + "ModelAdaptersMixin", + "ModelConfigAdaptersMixin", + "ModelWithHeadsAdaptersMixin", + ], + "models.auto": [ + "ADAPTER_MODEL_MAPPING", + "MODEL_WITH_HEADS_MAPPING", + "AutoAdapterModel", + "AutoModelWithHeads", + ], + "models.bart": [ + "BartAdapterModel", + "BartModelWithHeads", + ], + "models.bert": [ + "BertAdapterModel", + "BertModelWithHeads", + ], + "models.distilbert": [ + "DistilBertAdapterModel", + "DistilBertModelWithHeads", + ], + "models.gpt2": [ + "GPT2AdapterModel", + "GPT2ModelWithHeads", + ], + "models.mbart": [ + "MBartAdapterModel", + "MBartModelWithHeads", + ], + "models.roberta": [ + "RobertaAdapterModel", + "RobertaModelWithHeads", + ], + "models.t5": [ + "T5AdapterModel", + "T5ModelWithHeads", + ], + "models.xlm_roberta": [ + "XLMRobertaAdapterModel", + "XLMRobertaModelWithHeads", + ], + "trainer": ["AdapterTrainer", "Seq2SeqAdapterTrainer"], + "training": [ + "AdapterArguments", + "MultiLingAdapterArguments", + ], + "utils": [ + "ADAPTER_CACHE", + "AdapterInfo", + "AdapterType", + "get_adapter_config_hash", + "get_adapter_info", + "list_adapters", + ], +} + + +if TYPE_CHECKING: + from .composition import ( + AdapterCompositionBlock, + BatchSplit, + Fuse, + Parallel, + Split, + Stack, + parse_composition, + validate_composition, + ) + from .configuration import ( + ADAPTER_CONFIG_MAP, + ADAPTERFUSION_CONFIG_MAP, + DEFAULT_ADAPTER_CONFIG, + DEFAULT_ADAPTERFUSION_CONFIG, + AdapterConfig, + AdapterFusionConfig, + DynamicAdapterFusionConfig, + HoulsbyConfig, + HoulsbyInvConfig, + ModelAdaptersConfig, + PfeifferConfig, + PfeifferInvConfig, + StaticAdapterFusionConfig, + ) + from .context import AdapterSetup, ForwardContext + from .heads import ( + BertStyleMaskedLMHead, + BiaffineParsingHead, + CausalLMHead, + ClassificationHead, + DependencyParsingOutput, + ModelWithFlexibleHeadsAdaptersMixin, + MultiHeadOutput, + MultiLabelClassificationHead, + MultipleChoiceHead, + PredictionHead, + QuestionAnsweringHead, + Seq2SeqLMHead, + TaggingHead, + ) + from .layer import AdapterLayer + from .model_mixin import ( + InvertibleAdaptersMixin, + ModelAdaptersMixin, + ModelConfigAdaptersMixin, + ModelWithHeadsAdaptersMixin, + ) + from .models.auto import ADAPTER_MODEL_MAPPING, MODEL_WITH_HEADS_MAPPING, AutoAdapterModel, AutoModelWithHeads + from .models.bart import BartAdapterModel, BartModelWithHeads + from .models.bert import BertAdapterModel, BertModelWithHeads + from .models.distilbert import 
DistilBertAdapterModel, DistilBertModelWithHeads + from .models.gpt2 import GPT2AdapterModel, GPT2ModelWithHeads + from .models.mbart import MBartAdapterModel, MBartModelWithHeads + from .models.roberta import RobertaAdapterModel, RobertaModelWithHeads + from .models.t5 import T5AdapterModel, T5ModelWithHeads + from .models.xlm_roberta import XLMRobertaAdapterModel, XLMRobertaModelWithHeads + from .trainer import AdapterTrainer, Seq2SeqAdapterTrainer + from .training import AdapterArguments, MultiLingAdapterArguments + from .utils import ( + ADAPTER_CACHE, + AdapterInfo, + AdapterType, + get_adapter_config_hash, + get_adapter_info, + list_adapters, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure) diff --git a/src/transformers/adapters/hub_mixin.py b/src/transformers/adapters/hub_mixin.py index b1df68b7f..29babd6cf 100644 --- a/src/transformers/adapters/hub_mixin.py +++ b/src/transformers/adapters/hub_mixin.py @@ -34,9 +34,9 @@ Now, the adapter can be loaded and activated like this: ```python -from transformers import AutoModelWithHeads +from transformers import AutoAdapterModel -model = AutoModelWithHeads.from_pretrained("{model_name}") +model = AutoAdapterModel.from_pretrained("{model_name}") adapter_name = model.load_adapter("{adapter_repo_name}", source="hf", set_active=True) ``` diff --git a/src/transformers/adapters/loading.py b/src/transformers/adapters/loading.py index de54f9253..2f17ea36b 100644 --- a/src/transformers/adapters/loading.py +++ b/src/transformers/adapters/loading.py @@ -677,6 +677,16 @@ def load(self, save_directory, load_as=None, loading_info=None, **kwargs): if self.model.__class__.__name__ == config["model_class"]: head_name = load_as or config["name"] head_config = config["config"] + elif config["model_class"].endswith("ModelWithHeads"): + this_class = self.model.__class__.__name__.replace("AdapterModel", "") + other_class = config["model_class"].replace("ModelWithHeads", "") + if this_class == other_class: + head_name = load_as or config["name"] + head_config = config["config"] + else: + raise ValueError( + f"Cannot automatically convert prediction head of model class {config['model_class']} to flex head." 
+ ) # try to convert a static head to a flex head elif self.convert_to_flex_head and config["model_class"] in STATIC_TO_FLEX_HEAD_MAP: head_name = kwargs.pop("main_load_name", load_as) diff --git a/src/transformers/adapters/mixins/__init__.py b/src/transformers/adapters/mixins/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/transformers/adapters/mixins/bart.py b/src/transformers/adapters/mixins/bart.py new file mode 100644 index 000000000..d6c0894fc --- /dev/null +++ b/src/transformers/adapters/mixins/bart.py @@ -0,0 +1,50 @@ +from typing import Iterable, Tuple + +import torch.nn as nn + +from ..layer import AdapterLayer +from ..model_mixin import InvertibleAdaptersMixin, ModelAdaptersMixin + + +class BartEncoderLayerAdaptersMixin: + """Adds adapters to the BartEncoderLayer module of BART.""" + + def _init_adapter_modules(self): + self.attention_adapters = AdapterLayer("mh_adapter", self.config) + self.output_adapters = AdapterLayer("output_adapter", self.config) + self.attention_adapters._init_adapter_modules() + self.output_adapters._init_adapter_modules() + + +class BartDecoderLayerAdaptersMixin(BartEncoderLayerAdaptersMixin): + """Adds adapters to the BartDecoderLayer module of BART.""" + + def _init_adapter_modules(self): + super()._init_adapter_modules() + self.cross_attention_adapters = AdapterLayer("cross_adapter", self.config) + self.cross_attention_adapters._init_adapter_modules() + + +class BartModelAdaptersMixin(InvertibleAdaptersMixin, ModelAdaptersMixin): + """Adds adapters to the BartModel class.""" + + def iter_layers(self) -> Iterable[Tuple[int, nn.Module]]: + if hasattr(self, "encoder"): + for i, layer in enumerate(self.encoder.layers): + yield i, layer + for i, layer in enumerate(self.decoder.layers, start=len(self.encoder.layers)): + yield i, layer + else: + for i, layer in enumerate(self.decoder.layers): + yield i, layer + + def _init_adapter_modules(self): + if hasattr(self, "encoder"): + # In BART, the invertible adapters are implemented by the encoder module. + # Therefore, relay mixin calls to the encoder here. 
+            self.invertible_adapters = self.encoder.invertible_adapters
+            self.add_invertible_adapter = self.encoder.add_invertible_adapter
+            self.get_invertible_adapter = self.encoder.get_invertible_adapter
+            self.enable_invertible_adapters = self.encoder.enable_invertible_adapters
+            self.invertible_adapters_forward = self.encoder.invertible_adapters_forward
+        super()._init_adapter_modules()
diff --git a/src/transformers/adapters/mixins/bert.py b/src/transformers/adapters/mixins/bert.py
new file mode 100644
index 000000000..3dba2957b
--- /dev/null
+++ b/src/transformers/adapters/mixins/bert.py
@@ -0,0 +1,34 @@
+import logging
+from typing import Iterable, Tuple
+
+import torch.nn as nn
+
+from ..layer import AdapterLayer
+from ..model_mixin import InvertibleAdaptersMixin, ModelAdaptersMixin
+
+
+logger = logging.getLogger(__name__)
+
+
+# For backwards compatibility, BertSelfOutput inherits directly from AdapterLayer
+class BertSelfOutputAdaptersMixin(AdapterLayer):
+    """Adds adapters to the BertSelfOutput module."""
+
+    def __init__(self):
+        super().__init__("mh_adapter", None)
+
+
+# For backwards compatibility, BertOutput inherits directly from AdapterLayer
+class BertOutputAdaptersMixin(AdapterLayer):
+    """Adds adapters to the BertOutput module."""
+
+    def __init__(self):
+        super().__init__("output_adapter", None)
+
+
+class BertModelAdaptersMixin(InvertibleAdaptersMixin, ModelAdaptersMixin):
+    """Adds adapters to the BertModel module."""
+
+    def iter_layers(self) -> Iterable[Tuple[int, nn.Module]]:
+        for i, layer in enumerate(self.encoder.layer):
+            yield i, layer
diff --git a/src/transformers/adapters/mixins/distilbert.py b/src/transformers/adapters/mixins/distilbert.py
new file mode 100644
index 000000000..c3b431f1f
--- /dev/null
+++ b/src/transformers/adapters/mixins/distilbert.py
@@ -0,0 +1,24 @@
+from typing import Iterable, Tuple
+
+import torch.nn as nn
+
+from ..layer import AdapterLayer
+from ..model_mixin import InvertibleAdaptersMixin, ModelAdaptersMixin
+
+
+class DistilBertTransfomerBlockAdaptersMixin:
+    """Adds adapters to the TransformerBlock module of DistilBert."""
+
+    def _init_adapter_modules(self):
+        self.attention_adapters = AdapterLayer("mh_adapter", self.config)
+        self.output_adapters = AdapterLayer("output_adapter", self.config)
+        self.attention_adapters._init_adapter_modules()
+        self.output_adapters._init_adapter_modules()
+
+
+class DistilBertModelAdaptersMixin(InvertibleAdaptersMixin, ModelAdaptersMixin):
+    """Adds adapters to the DistilBert module."""
+
+    def iter_layers(self) -> Iterable[Tuple[int, nn.Module]]:
+        for i, layer in enumerate(self.transformer.layer):
+            yield i, layer
diff --git a/src/transformers/adapters/models/encoder_decoder.py b/src/transformers/adapters/mixins/encoder_decoder.py
similarity index 100%
rename from src/transformers/adapters/models/encoder_decoder.py
rename to src/transformers/adapters/mixins/encoder_decoder.py
diff --git a/src/transformers/adapters/mixins/gpt2.py b/src/transformers/adapters/mixins/gpt2.py
new file mode 100644
index 000000000..93cddbdde
--- /dev/null
+++ b/src/transformers/adapters/mixins/gpt2.py
@@ -0,0 +1,22 @@
+from typing import Iterable, Tuple
+
+import torch.nn as nn
+
+from ..layer import AdapterLayer
+from ..model_mixin import InvertibleAdaptersMixin, ModelAdaptersMixin
+
+
+class GPT2DecoderBlockAdaptersMixin:
+    """Adds adapters to the Block module of GPT-2."""
+
+    def _init_adapter_modules(self):
+        self.attention_adapters = AdapterLayer("mh_adapter", self.config)
+        self.output_adapters =
AdapterLayer("output_adapter", self.config) + self.attention_adapters._init_adapter_modules() + self.output_adapters._init_adapter_modules() + + +class GPT2ModelAdapterMixin(InvertibleAdaptersMixin, ModelAdaptersMixin): + def iter_layers(self) -> Iterable[Tuple[int, nn.Module]]: + for i, layer in enumerate(self.base_model.h): + yield i, layer diff --git a/src/transformers/adapters/mixins/t5.py b/src/transformers/adapters/mixins/t5.py new file mode 100644 index 000000000..bd1d8efdc --- /dev/null +++ b/src/transformers/adapters/mixins/t5.py @@ -0,0 +1,47 @@ +from typing import Iterable, Tuple + +import torch.nn as nn + +from ..layer import AdapterLayer +from ..model_mixin import InvertibleAdaptersMixin, ModelAdaptersMixin + + +class T5SelfAttentionLayerAdaptersMixin(AdapterLayer): + def __init__(self): + super().__init__("mh_adapter", None) + + +class T5CrossAttentionLayerAdaptersMixin(AdapterLayer): + def __init__(self): + super().__init__("cross_adapter", None) + + +class T5FFLayerAdaptersMixin(AdapterLayer): + def __init__(self): + super().__init__("output_adapter", None) + + +class T5ModelAdaptersMixin(InvertibleAdaptersMixin, ModelAdaptersMixin): + """Adds adapters to the T5Model class.""" + + def iter_layers(self) -> Iterable[Tuple[int, nn.Module]]: + if hasattr(self, "encoder"): + for i, layer in enumerate(self.encoder.block): + yield i, layer + for i, layer in enumerate(self.decoder.block, start=len(self.encoder.block)): + yield i, layer + else: + for i, layer in enumerate(self.decoder.block): + yield i, layer + + def _init_adapter_modules(self): + if hasattr(self, "encoder"): + # In T5, the invertible adapters are implemented by the encoder module. + # Therefore, relay mixin calls to the encoder here. + self.invertible_adapters = self.encoder.invertible_adapters + self.add_invertible_adapter = self.encoder.add_invertible_adapter + self.get_invertible_adapter = self.encoder.get_invertible_adapter + self.enable_invertible_adapters = self.encoder.enable_invertible_adapters + self.invertible_adapters_forward = self.encoder.invertible_adapters_forward + self.delete_invertible_adapter = self.encoder.delete_invertible_adapter + super()._init_adapter_modules() diff --git a/src/transformers/adapters/models/auto.py b/src/transformers/adapters/models/auto.py new file mode 100644 index 000000000..549c3aeef --- /dev/null +++ b/src/transformers/adapters/models/auto.py @@ -0,0 +1,71 @@ +import warnings +from collections import OrderedDict + +from ...models.auto.auto_factory import _BaseAutoModelClass, _LazyAutoMapping, auto_class_update +from ...models.auto.configuration_auto import CONFIG_MAPPING_NAMES + + +# Make sure that children are placed before parents! 
+ADAPTER_MODEL_MAPPING_NAMES = OrderedDict( + [ + ("xlm-roberta", "XLMRobertaAdapterModel"), + ("roberta", "RobertaAdapterModel"), + ("bert", "BertAdapterModel"), + ("distilbert", "DistilBertAdapterModel"), + ("bart", "BartAdapterModel"), + ("mbart", "MBartAdapterModel"), + ("gpt2", "GPT2AdapterModel"), + ("t5", "T5AdapterModel"), + ] +) +MODEL_WITH_HEADS_MAPPING_NAMES = OrderedDict( + [ + ("xlm-roberta", "XLMRobertaModelWithHeads"), + ("roberta", "RobertaModelWithHeads"), + ("bert", "BertModelWithHeads"), + ("distilbert", "DistilBertModelWithHeads"), + ("bart", "BartModelWithHeads"), + ("mbart", "MBartModelWithHeads"), + ("gpt2", "GPT2ModelWithHeads"), + ("t5", "T5ModelWithHeads"), + ] +) + +ADAPTER_MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, ADAPTER_MODEL_MAPPING_NAMES) +MODEL_WITH_HEADS_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_WITH_HEADS_MAPPING_NAMES) + + +class AutoAdapterModel(_BaseAutoModelClass): + _model_mapping = ADAPTER_MODEL_MAPPING + + +AutoAdapterModel = auto_class_update(AutoAdapterModel, head_doc="adapters and flexible heads") + + +class AutoModelWithHeads(_BaseAutoModelClass): + _model_mapping = MODEL_WITH_HEADS_MAPPING + + @classmethod + def from_config(cls, config): + warnings.warn( + "This class has been renamed to `{}` in v3. " + "Please use the new class instead as this class might be removed in a future version.".format( + cls.__bases__[0].__name__ + ), + FutureWarning, + ) + return super().from_config(config) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + warnings.warn( + "This class has been renamed to `{}` in v3. " + "Please use the new class instead as this class might be removed in a future version.".format( + cls.__bases__[0].__name__ + ), + FutureWarning, + ) + return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + +AutoModelWithHeads = auto_class_update(AutoModelWithHeads, head_doc="flexible heads") diff --git a/src/transformers/adapters/models/bart.py b/src/transformers/adapters/models/bart.py index 590b57356..8e010d81b 100644 --- a/src/transformers/adapters/models/bart.py +++ b/src/transformers/adapters/models/bart.py @@ -1,7 +1,25 @@ -from typing import Iterable, Tuple +import warnings -import torch.nn as nn +import torch +from ...file_utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, +) +from ...models.bart.modeling_bart import ( + _CHECKPOINT_FOR_DOC, + _CONFIG_FOR_DOC, + _TOKENIZER_FOR_DOC, + BART_INPUTS_DOCSTRING, + BART_START_DOCSTRING, + BartConfig, + BartModel, + BartPretrainedModel, + shift_tokens_right, +) +from ..composition import adjust_tensors_for_parallel from ..heads import ( ClassificationHead, ModelWithFlexibleHeadsAdaptersMixin, @@ -9,58 +27,143 @@ QuestionAnsweringHead, Seq2SeqLMHead, ) -from ..layer import AdapterLayer -from ..model_mixin import InvertibleAdaptersMixin, ModelAdaptersMixin - -class BartEncoderLayerAdaptersMixin: - """Adds adapters to the BartEncoderLayer module of BART.""" - def _init_adapter_modules(self): - self.attention_adapters = AdapterLayer("mh_adapter", self.config) - self.output_adapters = AdapterLayer("output_adapter", self.config) - self.attention_adapters._init_adapter_modules() - self.output_adapters._init_adapter_modules() +@add_start_docstrings( + "BART Model with the option to add multiple flexible prediction heads on top.", BART_START_DOCSTRING +) +class BartAdapterModel(ModelWithFlexibleHeadsAdaptersMixin, 
BartPretrainedModel): + def __init__(self, config: BartConfig, **kwargs): + super().__init__(config, **kwargs) + self.model = BartModel(config) + self._init_head_modules() -class BartDecoderLayerAdaptersMixin(BartEncoderLayerAdaptersMixin): - """Adds adapters to the BartDecoderLayer module of BART.""" + def get_encoder(self): + return self.model.get_encoder() - def _init_adapter_modules(self): - super()._init_adapter_modules() - self.cross_attention_adapters = AdapterLayer("cross_adapter", self.config) - self.cross_attention_adapters._init_adapter_modules() + def get_decoder(self): + return self.model.get_decoder() + @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=ModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + encoder_outputs=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + past_key_values=None, + head=None, + **kwargs + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict -class BartModelAdaptersMixin(InvertibleAdaptersMixin, ModelAdaptersMixin): - """Adds adapters to the BartModel class.""" + if "labels" in kwargs or "start_positions" in kwargs and "end_positions" in kwargs: + use_cache = False - def iter_layers(self) -> Iterable[Tuple[int, nn.Module]]: - if hasattr(self, "encoder"): - for i, layer in enumerate(self.encoder.layers): - yield i, layer - for i, layer in enumerate(self.decoder.layers, start=len(self.encoder.layers)): - yield i, layer + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + past_key_values=past_key_values, + ) + # sequence classification based on last token in sequence + x = outputs[0] # last hidden state + if input_ids is not None and x.shape[1] == input_ids.shape[1]: + eos_mask = input_ids.eq(self.config.eos_token_id) + (eos_mask,) = adjust_tensors_for_parallel(x, eos_mask) + if len(torch.unique(eos_mask.sum(1))) > 1: + raise ValueError("All examples must have the same number of tokens.") + cls_representation = x[eos_mask, :].view(x.size(0), -1, x.size(-1))[:, -1, :] else: - for i, layer in enumerate(self.decoder.layers): - yield i, layer - - def _init_adapter_modules(self): - if hasattr(self, "encoder"): - # In BART, the invertible adapters are implemented by the encoder module. - # Therefore, relay mixin calls to the encoder here. 
- self.invertible_adapters = self.encoder.invertible_adapters - self.add_invertible_adapter = self.encoder.add_invertible_adapter - self.get_invertible_adapter = self.encoder.get_invertible_adapter - self.enable_invertible_adapters = self.encoder.enable_invertible_adapters - self.invertible_adapters_forward = self.encoder.invertible_adapters_forward - super()._init_adapter_modules() - - -class BartModelHeadsMixin(ModelWithFlexibleHeadsAdaptersMixin): - """ - Adds flexible heads to a BART model. - """ + cls_representation = x + + head_outputs = self.forward_head( + outputs, + head_name=head, + cls_output=cls_representation, + attention_mask=attention_mask, + return_dict=return_dict, + **kwargs, + ) + + return head_outputs + + # Copied from BartForConditionalGeneration + def prepare_inputs_for_generation( + self, + decoder_input_ids, + past=None, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + use_cache=None, + encoder_outputs=None, + **kwargs + ): + # cut decoder_input_ids if past is used + if past is not None: + decoder_input_ids = decoder_input_ids[:, -1:] + + return { + "input_ids": None, # encoder_outputs is defined. input_ids not needed + "encoder_outputs": encoder_outputs, + "past_key_values": past, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "cross_attn_head_mask": cross_attn_head_mask, + "use_cache": use_cache, # change this to avoid caching (presumably for debugging) + } + + # Copied from BartForConditionalGeneration + def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): + return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) + + # Copied from BartForConditionalGeneration + @staticmethod + def _reorder_cache(past, beam_idx): + reordered_past = () + for layer_past in past: + # cached cross_attention states don't have to be reordered -> they are always the same + reordered_past += ( + tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:], + ) + return reordered_past head_types = { "classification": ClassificationHead, @@ -123,3 +226,37 @@ def add_seq2seq_lm_head( """ head = Seq2SeqLMHead(self, head_name) self.add_prediction_head(head, overwrite_ok=overwrite_ok) + + +class BartModelWithHeads(BartAdapterModel): + def __init__(self, *args, **kwargs): + warnings.warn( + "This class has been renamed to `{}` in v3. " + "Please use the new class instead as this class might be removed in a future version.".format( + self.__class__.__bases__[0].__name__ + ), + FutureWarning, + ) + super().__init__(*args, **kwargs) + + @classmethod + def from_config(cls, config): + warnings.warn( + "This class has been renamed to `{}` in v3. " + "Please use the new class instead as this class might be removed in a future version.".format( + cls.__bases__[0].__name__ + ), + FutureWarning, + ) + return super().from_config(config) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + warnings.warn( + "This class has been renamed to `{}` in v3. 
" + "Please use the new class instead as this class might be removed in a future version.".format( + cls.__bases__[0].__name__ + ), + FutureWarning, + ) + return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) diff --git a/src/transformers/adapters/models/bert.py b/src/transformers/adapters/models/bert.py index 6fd51986f..31fbaad95 100644 --- a/src/transformers/adapters/models/bert.py +++ b/src/transformers/adapters/models/bert.py @@ -1,8 +1,21 @@ -import logging -from typing import Iterable, Tuple - -import torch.nn as nn +import warnings +from ...file_utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, +) +from ...models.bert.modeling_bert import ( + _CHECKPOINT_FOR_DOC, + _CONFIG_FOR_DOC, + _TOKENIZER_FOR_DOC, + BERT_INPUTS_DOCSTRING, + BERT_START_DOCSTRING, + BertModel, + BertPreTrainedModel, +) +from ..context import AdapterSetup from ..heads import ( BertStyleMaskedLMHead, BiaffineParsingHead, @@ -14,41 +27,86 @@ QuestionAnsweringHead, TaggingHead, ) -from ..layer import AdapterLayer -from ..model_mixin import InvertibleAdaptersMixin, ModelAdaptersMixin - - -logger = logging.getLogger(__name__) - -# For backwards compatibility, BertSelfOutput inherits directly from AdapterLayer -class BertSelfOutputAdaptersMixin(AdapterLayer): - """Adds adapters to the BertSelfOutput module.""" - - def __init__(self): - super().__init__("mh_adapter", None) +@add_start_docstrings( + """Bert Model transformer with the option to add multiple flexible heads on top.""", + BERT_START_DOCSTRING, +) +class BertAdapterModel(ModelWithFlexibleHeadsAdaptersMixin, BertPreTrainedModel): + def __init__(self, config): + super().__init__(config) -# For backwards compatibility, BertOutput inherits directly from AdapterLayer -class BertOutputAdaptersMixin(AdapterLayer): - """Adds adapters to the BertOutput module.""" + self.bert = BertModel(config) - def __init__(self): - super().__init__("output_adapter", None) + self._init_head_modules() + self.init_weights() -class BertModelAdaptersMixin(InvertibleAdaptersMixin, ModelAdaptersMixin): - """Adds adapters to the BertModel module.""" + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=ModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + head=None, + **kwargs + ): + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) - def iter_layers(self) -> Iterable[Tuple[int, nn.Module]]: - for i, layer in enumerate(self.encoder.layer): - yield i, layer + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + 
token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # BERT & RoBERTa return the pooled output as second item, we don't need that in these heads + if not return_dict: + head_inputs = (outputs[0],) + outputs[2:] + else: + head_inputs = outputs + pooled_output = outputs[1] -class BertModelHeadsMixin(ModelWithFlexibleHeadsAdaptersMixin): - """ - Adds flexible heads to a BERT-based model class. - """ + if head or AdapterSetup.get_context_head_setup() or self.active_head: + head_outputs = self.forward_head( + head_inputs, + head_name=head, + attention_mask=attention_mask, + return_dict=return_dict, + pooled_output=pooled_output, + **kwargs, + ) + return head_outputs + else: + # in case no head is used just return the output of the base model (including pooler output) + return outputs head_types = { "classification": ClassificationHead, @@ -177,3 +235,37 @@ def add_causal_lm_head(self, head_name, activation_function="gelu", overwrite_ok self, head_name, layers=2, activation_function=activation_function, layer_norm=True, bias=True ) self.add_prediction_head(head, overwrite_ok=overwrite_ok) + + +class BertModelWithHeads(BertAdapterModel): + def __init__(self, *args, **kwargs): + warnings.warn( + "This class has been renamed to `{}` in v3. " + "Please use the new class instead as this class might be removed in a future version.".format( + self.__class__.__bases__[0].__name__ + ), + FutureWarning, + ) + super().__init__(*args, **kwargs) + + @classmethod + def from_config(cls, config): + warnings.warn( + "This class has been renamed to `{}` in v3. " + "Please use the new class instead as this class might be removed in a future version.".format( + cls.__bases__[0].__name__ + ), + FutureWarning, + ) + return super().from_config(config) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + warnings.warn( + "This class has been renamed to `{}` in v3. 
" + "Please use the new class instead as this class might be removed in a future version.".format( + cls.__bases__[0].__name__ + ), + FutureWarning, + ) + return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) diff --git a/src/transformers/adapters/models/distilbert.py b/src/transformers/adapters/models/distilbert.py index 7b81664c9..c48458ac8 100644 --- a/src/transformers/adapters/models/distilbert.py +++ b/src/transformers/adapters/models/distilbert.py @@ -1,31 +1,271 @@ -from typing import Iterable, Tuple +import warnings import torch.nn as nn -from ..layer import AdapterLayer -from ..model_mixin import InvertibleAdaptersMixin, ModelAdaptersMixin -from .bert import BertModelHeadsMixin +from ...file_utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, +) +from ...models.distilbert.modeling_distilbert import ( + _CONFIG_FOR_DOC, + _TOKENIZER_FOR_DOC, + DISTILBERT_INPUTS_DOCSTRING, + DISTILBERT_START_DOCSTRING, + DistilBertModel, + DistilBertPreTrainedModel, +) +from ..heads import ( + BertStyleMaskedLMHead, + BiaffineParsingHead, + CausalLMHead, + ClassificationHead, + ModelWithFlexibleHeadsAdaptersMixin, + MultiLabelClassificationHead, + MultipleChoiceHead, + QuestionAnsweringHead, + TaggingHead, +) -class DistilBertTransfomerBlockAdaptersMixin: - """Adds adapters to the TransformerBlock module of DistilBert.""" +@add_start_docstrings( + """DistilBert Model transformer with the option to add multiple flexible heads on top.""", + DISTILBERT_START_DOCSTRING, +) +class DistilBertAdapterModel(ModelWithFlexibleHeadsAdaptersMixin, DistilBertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.distilbert = DistilBertModel(config) - def _init_adapter_modules(self): - self.attention_adapters = AdapterLayer("mh_adapter", self.config) - self.output_adapters = AdapterLayer("output_adapter", self.config) - self.attention_adapters._init_adapter_modules() - self.output_adapters._init_adapter_modules() + self._init_head_modules() + self.init_weights() -class DistilBertModelAdaptersMixin(InvertibleAdaptersMixin, ModelAdaptersMixin): - """Adds adapters to the DistilBert module.""" + def get_position_embeddings(self) -> nn.Embedding: + """ + Returns the position embeddings + """ + return self.distilbert.get_position_embeddings() - def iter_layers(self) -> Iterable[Tuple[int, nn.Module]]: - for i, layer in enumerate(self.transformer.layer): - yield i, layer + def resize_position_embeddings(self, new_num_position_embeddings: int): + """ + Resizes position embeddings of the model if :obj:`new_num_position_embeddings != + config.max_position_embeddings`. + Arguments: + new_num_position_embeddings (:obj:`int`): + The number of new position embedding matrix. If position embeddings are learned, increasing the size + will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the + end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the + size will add correct vectors at the end following the position encoding algorithm, whereas reducing + the size will remove vectors from the end. 
+ """ + self.distilbert.resize_position_embeddings(new_num_position_embeddings) -class DistilBertModelHeadsMixin(BertModelHeadsMixin): - """Adds heads to a DistilBert model.""" + @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint="distilbert-base-uncased", + output_type=ModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + head=None, + **kwargs + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict - pass + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + distilbert_output = self.distilbert( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + outputs = self.forward_head( + distilbert_output, head_name=head, attention_mask=attention_mask, return_dict=return_dict, **kwargs + ) + + return outputs + + head_types = { + "classification": ClassificationHead, + "multilabel_classification": MultiLabelClassificationHead, + "tagging": TaggingHead, + "multiple_choice": MultipleChoiceHead, + "question_answering": QuestionAnsweringHead, + "dependency_parsing": BiaffineParsingHead, + "masked_lm": BertStyleMaskedLMHead, + "causal_lm": CausalLMHead, + } + + def add_classification_head( + self, + head_name, + num_labels=2, + layers=2, + activation_function="tanh", + overwrite_ok=False, + multilabel=False, + id2label=None, + use_pooler=False, + ): + """ + Adds a sequence classification head on top of the model. + + Args: + head_name (str): The name of the head. + num_labels (int, optional): Number of classification labels. Defaults to 2. + layers (int, optional): Number of layers. Defaults to 2. + activation_function (str, optional): Activation function. Defaults to 'tanh'. + overwrite_ok (bool, optional): Force overwrite if a head with the same name exists. Defaults to False. + multilabel (bool, optional): Enable multilabel classification setup. Defaults to False. + """ + + if multilabel: + head = MultiLabelClassificationHead( + self, head_name, num_labels, layers, activation_function, id2label, use_pooler + ) + else: + head = ClassificationHead(self, head_name, num_labels, layers, activation_function, id2label, use_pooler) + self.add_prediction_head(head, overwrite_ok) + + def add_multiple_choice_head( + self, + head_name, + num_choices=2, + layers=2, + activation_function="tanh", + overwrite_ok=False, + id2label=None, + use_pooler=False, + ): + """ + Adds a multiple choice head on top of the model. + + Args: + head_name (str): The name of the head. + num_choices (int, optional): Number of choices. Defaults to 2. + layers (int, optional): Number of layers. Defaults to 2. + activation_function (str, optional): Activation function. Defaults to 'tanh'. + overwrite_ok (bool, optional): Force overwrite if a head with the same name exists. Defaults to False. 
+ """ + head = MultipleChoiceHead(self, head_name, num_choices, layers, activation_function, id2label, use_pooler) + self.add_prediction_head(head, overwrite_ok) + + def add_tagging_head( + self, head_name, num_labels=2, layers=1, activation_function="tanh", overwrite_ok=False, id2label=None + ): + """ + Adds a token classification head on top of the model. + + Args: + head_name (str): The name of the head. + num_labels (int, optional): Number of classification labels. Defaults to 2. + layers (int, optional): Number of layers. Defaults to 1. + activation_function (str, optional): Activation function. Defaults to 'tanh'. + overwrite_ok (bool, optional): Force overwrite if a head with the same name exists. Defaults to False. + """ + head = TaggingHead(self, head_name, num_labels, layers, activation_function, id2label) + self.add_prediction_head(head, overwrite_ok) + + def add_qa_head( + self, head_name, num_labels=2, layers=1, activation_function="tanh", overwrite_ok=False, id2label=None + ): + head = QuestionAnsweringHead(self, head_name, num_labels, layers, activation_function, id2label) + self.add_prediction_head(head, overwrite_ok) + + def add_dependency_parsing_head(self, head_name, num_labels=2, overwrite_ok=False, id2label=None): + """ + Adds a biaffine dependency parsing head on top of the model. The parsing head uses the architecture described + in "Is Supervised Syntactic Parsing Beneficial for Language Understanding? An Empirical Investigation" (Glavaš + & Vulić, 2021) (https://arxiv.org/pdf/2008.06788.pdf). + + Args: + head_name (str): The name of the head. + num_labels (int, optional): Number of labels. Defaults to 2. + overwrite_ok (bool, optional): Force overwrite if a head with the same name exists. Defaults to False. + id2label (dict, optional): Mapping from label ids to labels. Defaults to None. + """ + head = BiaffineParsingHead(self, head_name, num_labels, id2label) + self.add_prediction_head(head, overwrite_ok) + + def add_masked_lm_head(self, head_name, activation_function="gelu", overwrite_ok=False): + """ + Adds a masked language modeling head on top of the model. + + Args: + head_name (str): The name of the head. + activation_function (str, optional): Activation function. Defaults to 'gelu'. + overwrite_ok (bool, optional): Force overwrite if a head with the same name exists. Defaults to False. + """ + head = BertStyleMaskedLMHead(self, head_name, activation_function=activation_function) + self.add_prediction_head(head, overwrite_ok=overwrite_ok) + + def add_causal_lm_head(self, head_name, activation_function="gelu", overwrite_ok=False): + """ + Adds a causal language modeling head on top of the model. + + Args: + head_name (str): The name of the head. + activation_function (str, optional): Activation function. Defaults to 'gelu'. + overwrite_ok (bool, optional): Force overwrite if a head with the same name exists. Defaults to False. + """ + head = CausalLMHead( + self, head_name, layers=2, activation_function=activation_function, layer_norm=True, bias=True + ) + self.add_prediction_head(head, overwrite_ok=overwrite_ok) + + +class DistilBertModelWithHeads(DistilBertAdapterModel): + def __init__(self, *args, **kwargs): + warnings.warn( + "This class has been renamed to `{}` in v3. 
" + "Please use the new class instead as this class might be removed in a future version.".format( + self.__class__.__bases__[0].__name__ + ), + FutureWarning, + ) + super().__init__(*args, **kwargs) + + @classmethod + def from_config(cls, config): + warnings.warn( + "This class has been renamed to `{}` in v3. " + "Please use the new class instead as this class might be removed in a future version.".format( + cls.__bases__[0].__name__ + ), + FutureWarning, + ) + return super().from_config(config) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + warnings.warn( + "This class has been renamed to `{}` in v3. " + "Please use the new class instead as this class might be removed in a future version.".format( + cls.__bases__[0].__name__ + ), + FutureWarning, + ) + return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) diff --git a/src/transformers/adapters/models/gpt2.py b/src/transformers/adapters/models/gpt2.py index b43ebd257..0f30f2014 100644 --- a/src/transformers/adapters/models/gpt2.py +++ b/src/transformers/adapters/models/gpt2.py @@ -1,31 +1,103 @@ -from typing import Iterable, Tuple +import logging +import warnings -import torch.nn as nn +import torch -from ..heads import CausalLMHead, ClassificationHead, MultiLabelClassificationHead, TaggingHead -from ..layer import AdapterLayer -from ..model_mixin import InvertibleAdaptersMixin, ModelAdaptersMixin -from .bert import ModelWithFlexibleHeadsAdaptersMixin +from ...file_utils import add_start_docstrings +from ...models.gpt2.modeling_gpt2 import GPT2_START_DOCSTRING, GPT2Model, GPT2PreTrainedModel +from ..composition import adjust_tensors_for_parallel +from ..heads import ( + CausalLMHead, + ClassificationHead, + ModelWithFlexibleHeadsAdaptersMixin, + MultiLabelClassificationHead, + TaggingHead, +) -class GPT2DecoderBlockAdaptersMixin: - """Adds adapters to the TransformerBlock module of DistilBert.""" +logger = logging.getLogger(__name__) - def _init_adapter_modules(self): - self.attention_adapters = AdapterLayer("mh_adapter", self.config) - self.output_adapters = AdapterLayer("output_adapter", self.config) - self.attention_adapters._init_adapter_modules() - self.output_adapters._init_adapter_modules() +@add_start_docstrings( + """ +The GPT2 Model that allows the loading of different heads dor different tasks. This enables a flexible use of the +models and adpters. Since this class does classification on the last token, it requires to know the position of the +last token. If a :obj:`pad_token_id` is defined in the configuration, it finds the last token that is not a padding +token in each row. If no :obj:`pad_token_id` is defined, it simply takes the last value in each row of the batch. Since +it cannot guess the padding tokens when :obj:`inputs_embeds` are passed instead of :obj:`input_ids`, it does the same +(take the last value in each row of the batch). 
+""", + GPT2_START_DOCSTRING, +) +class GPT2AdapterModel(ModelWithFlexibleHeadsAdaptersMixin, GPT2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.transformer = GPT2Model(config) -class GPT2ModelAdapterMixin(InvertibleAdaptersMixin, ModelAdaptersMixin): - def iter_layers(self) -> Iterable[Tuple[int, nn.Module]]: - for i, layer in enumerate(self.base_model.h): - yield i, layer + self._init_head_modules() + self.init_weights() -class GPT2ModelHeadsMixin(ModelWithFlexibleHeadsAdaptersMixin): - """Adds flexible heads to a GPT-2 model.""" + # Model parallel + self.model_parallel = False + self.device_map = None + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + head=None, + **kwargs + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + batch_size = outputs[0].shape[0] + + if self.config.pad_token_id is None: + # TODO-AH: this may result in unexpected behavior for classification. Find a better way to do this? + sequence_lengths = -1 + else: + if input_ids is not None: + sequence_lengths = torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1 + (sequence_lengths,) = adjust_tensors_for_parallel(outputs[0], sequence_lengths) + else: + sequence_lengths = -1 + logger.warning( + f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be " + f"unexpected if using padding tokens in conjunction with `inputs_embeds.`" + ) + + cls_logits = outputs[0][range(batch_size), sequence_lengths] + + outputs = self.forward_head( + outputs, + head_name=head, + cls_output=cls_logits, + attention_mask=attention_mask, + return_dict=return_dict, + **kwargs, + ) + + return outputs head_types = { "classification": ClassificationHead, @@ -72,3 +144,37 @@ def add_causal_lm_head(self, head_name, overwrite_ok=False): """ head = CausalLMHead(self, head_name) self.add_prediction_head(head, overwrite_ok=overwrite_ok) + + +class GPT2ModelWithHeads(GPT2AdapterModel): + def __init__(self, *args, **kwargs): + warnings.warn( + "This class has been renamed to `{}` in v3. " + "Please use the new class instead as this class might be removed in a future version.".format( + self.__class__.__bases__[0].__name__ + ), + FutureWarning, + ) + super().__init__(*args, **kwargs) + + @classmethod + def from_config(cls, config): + warnings.warn( + "This class has been renamed to `{}` in v3. " + "Please use the new class instead as this class might be removed in a future version.".format( + cls.__bases__[0].__name__ + ), + FutureWarning, + ) + return super().from_config(config) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + warnings.warn( + "This class has been renamed to `{}` in v3. 
" + "Please use the new class instead as this class might be removed in a future version.".format( + cls.__bases__[0].__name__ + ), + FutureWarning, + ) + return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) diff --git a/src/transformers/adapters/models/mbart.py b/src/transformers/adapters/models/mbart.py new file mode 100644 index 000000000..6e1cd3c75 --- /dev/null +++ b/src/transformers/adapters/models/mbart.py @@ -0,0 +1,262 @@ +import warnings + +import torch + +from ...file_utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, +) +from ...models.mbart.modeling_mbart import ( + _CHECKPOINT_FOR_DOC, + _CONFIG_FOR_DOC, + _TOKENIZER_FOR_DOC, + MBART_INPUTS_DOCSTRING, + MBART_START_DOCSTRING, + MBartConfig, + MBartModel, + MBartPreTrainedModel, + shift_tokens_right, +) +from ..composition import adjust_tensors_for_parallel +from ..heads import ( + ClassificationHead, + ModelWithFlexibleHeadsAdaptersMixin, + MultiLabelClassificationHead, + QuestionAnsweringHead, + Seq2SeqLMHead, +) + + +@add_start_docstrings( + "MBART Model with the option to add multiple flexible prediction heads on top.", MBART_START_DOCSTRING +) +class MBartAdapterModel(ModelWithFlexibleHeadsAdaptersMixin, MBartPreTrainedModel): + def __init__(self, config: MBartConfig, **kwargs): + super().__init__(config, **kwargs) + self.model = MBartModel(config) + + self._init_head_modules() + + def get_encoder(self): + return self.model.get_encoder() + + def get_decoder(self): + return self.model.get_decoder() + + @add_start_docstrings_to_model_forward(MBART_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=ModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + encoder_outputs=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + past_key_values=None, + head=None, + **kwargs + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if "labels" in kwargs or "start_positions" in kwargs and "end_positions" in kwargs: + use_cache = False + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + past_key_values=past_key_values, + ) + # sequence classification based on last token in sequence + x = outputs[0] # last hidden state + if input_ids is not None and x.shape[1] == input_ids.shape[1]: + eos_mask = input_ids.eq(self.config.eos_token_id) + (eos_mask,) = adjust_tensors_for_parallel(x, eos_mask) + if len(torch.unique(eos_mask.sum(1))) > 1: + raise ValueError("All examples must have the same number of tokens.") + cls_representation = x[eos_mask, :].view(x.size(0), -1, x.size(-1))[:, -1, :] + else: + cls_representation = x + + head_outputs = self.forward_head( + outputs, + head_name=head, + cls_output=cls_representation, + attention_mask=attention_mask, + return_dict=return_dict, + **kwargs, + ) + + return head_outputs + + # Copied from MBartForConditionalGeneration + def prepare_inputs_for_generation( + self, + decoder_input_ids, + past=None, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + use_cache=None, + encoder_outputs=None, + **kwargs + ): + # cut decoder_input_ids if past is used + if past is not None: + decoder_input_ids = decoder_input_ids[:, -1:] + + return { + "input_ids": None, # encoder_outputs is defined. input_ids not needed + "encoder_outputs": encoder_outputs, + "past_key_values": past, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "cross_attn_head_mask": cross_attn_head_mask, + "use_cache": use_cache, # change this to avoid caching (presumably for debugging) + } + + # Copied from MBartForConditionalGeneration + def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): + return shift_tokens_right(labels, self.config.pad_token_id) + + # Copied from MBartForConditionalGeneration + @staticmethod + def _reorder_cache(past, beam_idx): + reordered_past = () + for layer_past in past: + # cached cross_attention states don't have to be reordered -> they are always the same + reordered_past += ( + tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:], + ) + return reordered_past + + head_types = { + "classification": ClassificationHead, + "multilabel_classification": MultiLabelClassificationHead, + "question_answering": QuestionAnsweringHead, + "seq2seq_lm": Seq2SeqLMHead, + } + + def add_classification_head( + self, + head_name, + num_labels=2, + layers=2, + activation_function="tanh", + overwrite_ok=False, + multilabel=False, + id2label=None, + ): + """ + Adds a sequence classification head on top of the model. + + Args: + head_name (str): The name of the head. + num_labels (int, optional): Number of classification labels. Defaults to 2. + layers (int, optional): Number of layers. Defaults to 2. + activation_function (str, optional): Activation function. 
Defaults to 'tanh'. + overwrite_ok (bool, optional): Force overwrite if a head with the same name exists. Defaults to False. + multilabel (bool, optional): Enable multilabel classification setup. Defaults to False. + """ + + if multilabel: + head = MultiLabelClassificationHead(self, head_name, num_labels, layers, activation_function, id2label) + else: + head = ClassificationHead(self, head_name, num_labels, layers, activation_function, id2label) + self.add_prediction_head(head, overwrite_ok) + + def add_qa_head( + self, + head_name, + num_labels=2, + layers=1, + activation_function="tanh", + overwrite_ok=False, + id2label=None, + ): + head = QuestionAnsweringHead(self, head_name, num_labels, layers, activation_function, id2label) + self.add_prediction_head(head, overwrite_ok) + + def add_seq2seq_lm_head( + self, + head_name, + overwrite_ok=False, + ): + """ + Adds a sequence-to-sequence language modeling head on top of the model. + + Args: + head_name (str): The name of the head. + overwrite_ok (bool, optional): Force overwrite if a head with the same name exists. Defaults to False. + """ + head = Seq2SeqLMHead(self, head_name) + self.add_prediction_head(head, overwrite_ok=overwrite_ok) + + +class MBartModelWithHeads(MBartAdapterModel): + def __init__(self, *args, **kwargs): + warnings.warn( + "This class has been renamed to `{}` in v3. " + "Please use the new class instead as this class might be removed in a future version.".format( + self.__class__.__bases__[0].__name__ + ), + FutureWarning, + ) + super().__init__(*args, **kwargs) + + @classmethod + def from_config(cls, config): + warnings.warn( + "This class has been renamed to `{}` in v3. " + "Please use the new class instead as this class might be removed in a future version.".format( + cls.__bases__[0].__name__ + ), + FutureWarning, + ) + return super().from_config(config) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + warnings.warn( + "This class has been renamed to `{}` in v3. 
" + "Please use the new class instead as this class might be removed in a future version.".format( + cls.__bases__[0].__name__ + ), + FutureWarning, + ) + return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) diff --git a/src/transformers/adapters/models/roberta.py b/src/transformers/adapters/models/roberta.py new file mode 100644 index 000000000..4bcad514d --- /dev/null +++ b/src/transformers/adapters/models/roberta.py @@ -0,0 +1,270 @@ +import warnings + +from ...file_utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, +) +from ...models.roberta.modeling_roberta import ( + _CONFIG_FOR_DOC, + _TOKENIZER_FOR_DOC, + ROBERTA_INPUTS_DOCSTRING, + ROBERTA_START_DOCSTRING, + RobertaModel, + RobertaPreTrainedModel, +) +from ..context import AdapterSetup +from ..heads import ( + BertStyleMaskedLMHead, + BiaffineParsingHead, + CausalLMHead, + ClassificationHead, + ModelWithFlexibleHeadsAdaptersMixin, + MultiLabelClassificationHead, + MultipleChoiceHead, + QuestionAnsweringHead, + TaggingHead, +) + + +@add_start_docstrings( + """Roberta Model transformer with the option to add multiple flexible heads on top.""", + ROBERTA_START_DOCSTRING, +) +class RobertaAdapterModel(ModelWithFlexibleHeadsAdaptersMixin, RobertaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.roberta = RobertaModel(config) + + self._init_head_modules() + + self.init_weights() + + @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint="roberta-base", + output_type=ModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + head=None, + **kwargs + ): + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # BERT & RoBERTa return the pooled output as second item, we don't need that in these heads + if not return_dict: + head_inputs = (outputs[0],) + outputs[2:] + else: + head_inputs = outputs + pooled_output = outputs[1] + + if head or AdapterSetup.get_context_head_setup() or self.active_head: + head_outputs = self.forward_head( + head_inputs, + head_name=head, + attention_mask=attention_mask, + return_dict=return_dict, + pooled_output=pooled_output, + **kwargs, + ) + return head_outputs + else: + # in case no head is used just return the output of the base model (including pooler output) + 
return outputs + + head_types = { + "classification": ClassificationHead, + "multilabel_classification": MultiLabelClassificationHead, + "tagging": TaggingHead, + "multiple_choice": MultipleChoiceHead, + "question_answering": QuestionAnsweringHead, + "dependency_parsing": BiaffineParsingHead, + "masked_lm": BertStyleMaskedLMHead, + "causal_lm": CausalLMHead, + } + + def add_classification_head( + self, + head_name, + num_labels=2, + layers=2, + activation_function="tanh", + overwrite_ok=False, + multilabel=False, + id2label=None, + use_pooler=False, + ): + """ + Adds a sequence classification head on top of the model. + + Args: + head_name (str): The name of the head. + num_labels (int, optional): Number of classification labels. Defaults to 2. + layers (int, optional): Number of layers. Defaults to 2. + activation_function (str, optional): Activation function. Defaults to 'tanh'. + overwrite_ok (bool, optional): Force overwrite if a head with the same name exists. Defaults to False. + multilabel (bool, optional): Enable multilabel classification setup. Defaults to False. + """ + + if multilabel: + head = MultiLabelClassificationHead( + self, head_name, num_labels, layers, activation_function, id2label, use_pooler + ) + else: + head = ClassificationHead(self, head_name, num_labels, layers, activation_function, id2label, use_pooler) + self.add_prediction_head(head, overwrite_ok) + + def add_multiple_choice_head( + self, + head_name, + num_choices=2, + layers=2, + activation_function="tanh", + overwrite_ok=False, + id2label=None, + use_pooler=False, + ): + """ + Adds a multiple choice head on top of the model. + + Args: + head_name (str): The name of the head. + num_choices (int, optional): Number of choices. Defaults to 2. + layers (int, optional): Number of layers. Defaults to 2. + activation_function (str, optional): Activation function. Defaults to 'tanh'. + overwrite_ok (bool, optional): Force overwrite if a head with the same name exists. Defaults to False. + """ + head = MultipleChoiceHead(self, head_name, num_choices, layers, activation_function, id2label, use_pooler) + self.add_prediction_head(head, overwrite_ok) + + def add_tagging_head( + self, head_name, num_labels=2, layers=1, activation_function="tanh", overwrite_ok=False, id2label=None + ): + """ + Adds a token classification head on top of the model. + + Args: + head_name (str): The name of the head. + num_labels (int, optional): Number of classification labels. Defaults to 2. + layers (int, optional): Number of layers. Defaults to 1. + activation_function (str, optional): Activation function. Defaults to 'tanh'. + overwrite_ok (bool, optional): Force overwrite if a head with the same name exists. Defaults to False. + """ + head = TaggingHead(self, head_name, num_labels, layers, activation_function, id2label) + self.add_prediction_head(head, overwrite_ok) + + def add_qa_head( + self, head_name, num_labels=2, layers=1, activation_function="tanh", overwrite_ok=False, id2label=None + ): + head = QuestionAnsweringHead(self, head_name, num_labels, layers, activation_function, id2label) + self.add_prediction_head(head, overwrite_ok) + + def add_dependency_parsing_head(self, head_name, num_labels=2, overwrite_ok=False, id2label=None): + """ + Adds a biaffine dependency parsing head on top of the model. The parsing head uses the architecture described + in "Is Supervised Syntactic Parsing Beneficial for Language Understanding? An Empirical Investigation" (Glavaš + & Vulić, 2021) (https://arxiv.org/pdf/2008.06788.pdf). 
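
For orientation, a minimal usage sketch of the flexible-head API defined above. The checkpoint and head names are illustrative placeholders, the `transformers.adapters` import path follows the documentation added elsewhere in this patch, and the `active_head` setter is assumed from the flexible-heads mixin:

```python
from transformers import AutoTokenizer
from transformers.adapters import RobertaAdapterModel

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = RobertaAdapterModel.from_pretrained("roberta-base")

# Register two independent prediction heads on the shared base model.
model.add_classification_head("sentiment", num_labels=2)
model.add_tagging_head("ner", num_labels=9, layers=1)

batch = tokenizer("Adapters keep the base model frozen.", return_tensors="pt")

# Select the head per forward call via the `head` argument of forward() above ...
sentiment_out = model(**batch, head="sentiment")
# ... or activate one head globally so plain forward calls use it.
model.active_head = "ner"
ner_out = model(**batch)
```
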
+ + Args: + head_name (str): The name of the head. + num_labels (int, optional): Number of labels. Defaults to 2. + overwrite_ok (bool, optional): Force overwrite if a head with the same name exists. Defaults to False. + id2label (dict, optional): Mapping from label ids to labels. Defaults to None. + """ + head = BiaffineParsingHead(self, head_name, num_labels, id2label) + self.add_prediction_head(head, overwrite_ok) + + def add_masked_lm_head(self, head_name, activation_function="gelu", overwrite_ok=False): + """ + Adds a masked language modeling head on top of the model. + + Args: + head_name (str): The name of the head. + activation_function (str, optional): Activation function. Defaults to 'gelu'. + overwrite_ok (bool, optional): Force overwrite if a head with the same name exists. Defaults to False. + """ + head = BertStyleMaskedLMHead(self, head_name, activation_function=activation_function) + self.add_prediction_head(head, overwrite_ok=overwrite_ok) + + def add_causal_lm_head(self, head_name, activation_function="gelu", overwrite_ok=False): + """ + Adds a causal language modeling head on top of the model. + + Args: + head_name (str): The name of the head. + activation_function (str, optional): Activation function. Defaults to 'gelu'. + overwrite_ok (bool, optional): Force overwrite if a head with the same name exists. Defaults to False. + """ + head = CausalLMHead( + self, head_name, layers=2, activation_function=activation_function, layer_norm=True, bias=True + ) + self.add_prediction_head(head, overwrite_ok=overwrite_ok) + + +class RobertaModelWithHeads(RobertaAdapterModel): + def __init__(self, *args, **kwargs): + warnings.warn( + "This class has been renamed to `{}` in v3. " + "Please use the new class instead as this class might be removed in a future version.".format( + self.__class__.__bases__[0].__name__ + ), + FutureWarning, + ) + super().__init__(*args, **kwargs) + + @classmethod + def from_config(cls, config): + warnings.warn( + "This class has been renamed to `{}` in v3. " + "Please use the new class instead as this class might be removed in a future version.".format( + cls.__bases__[0].__name__ + ), + FutureWarning, + ) + return super().from_config(config) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + warnings.warn( + "This class has been renamed to `{}` in v3. 
" + "Please use the new class instead as this class might be removed in a future version.".format( + cls.__bases__[0].__name__ + ), + FutureWarning, + ) + return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) diff --git a/src/transformers/adapters/models/t5.py b/src/transformers/adapters/models/t5.py index fa344bc5b..c1fa3021b 100644 --- a/src/transformers/adapters/models/t5.py +++ b/src/transformers/adapters/models/t5.py @@ -1,56 +1,162 @@ -from typing import Iterable, Tuple - -import torch.nn as nn - -from ..heads import Seq2SeqLMHead -from ..layer import AdapterLayer -from ..model_mixin import InvertibleAdaptersMixin, ModelAdaptersMixin -from .bert import ModelWithFlexibleHeadsAdaptersMixin - - -class T5SelfAttentionLayerAdaptersMixin(AdapterLayer): - def __init__(self): - super().__init__("mh_adapter", None) - - -class T5CrossAttentionLayerAdaptersMixin(AdapterLayer): - def __init__(self): - super().__init__("cross_adapter", None) - - -class T5FFLayerAdaptersMixin(AdapterLayer): - def __init__(self): - super().__init__("output_adapter", None) - - -class T5ModelAdaptersMixin(InvertibleAdaptersMixin, ModelAdaptersMixin): - """Adds adapters to the T5Model class.""" - - def iter_layers(self) -> Iterable[Tuple[int, nn.Module]]: - if hasattr(self, "encoder"): - for i, layer in enumerate(self.encoder.block): - yield i, layer - for i, layer in enumerate(self.decoder.block, start=len(self.encoder.block)): - yield i, layer +import logging +import warnings + +import torch + +from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward +from ...models.t5.modeling_t5 import T5_INPUTS_DOCSTRING, T5_START_DOCSTRING, T5Model, T5PreTrainedModel +from ..heads import ModelWithFlexibleHeadsAdaptersMixin, Seq2SeqLMHead + + +logger = logging.getLogger(__name__) + + +@add_start_docstrings("T5 Model with the option to add multiple flexible prediction heads on top.", T5_START_DOCSTRING) +class T5AdapterModel(ModelWithFlexibleHeadsAdaptersMixin, T5PreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.transformer = T5Model(config) + + self._init_head_modules() + self._init_adapter_modules() + self.init_weights() + + # Model parallel + self.model_parallel = False + self.device_map = None + + def get_encoder(self): + return self.transformer.encoder + + def get_decoder(self): + return self.transformer.decoder + + @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + encoder_outputs=None, + past_key_values=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + head=None, + **kwargs + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: + # get decoder inputs from shifting lm labels to the right + decoder_input_ids = self._shift_right(labels) + + model_output = self.transformer( + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + encoder_outputs=encoder_outputs, + past_key_values=past_key_values, + 
inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = model_output[0] + # ToDo move head to device for parallel forward pass + + if self.config.tie_word_embeddings: + # Rescale output before projecting on vocab + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 + new_hidden_state = sequence_output * (self.config.d_model ** -0.5) + if isinstance(model_output, tuple): + model_output = (new_hidden_state,) + model_output[1:] + else: + model_output["last_hidden_state"] = new_hidden_state + + if head or self.active_head: + kwargs["labels"] = labels + head_outputs = self.forward_head( + model_output, + head_name=head, + return_dict=return_dict, + **kwargs, + ) + return head_outputs else: - for i, layer in enumerate(self.decoder.block): - yield i, layer - - def _init_adapter_modules(self): - if hasattr(self, "encoder"): - # In T5, the invertible adapters are implemented by the encoder module. - # Therefore, relay mixin calls to the encoder here. - self.invertible_adapters = self.encoder.invertible_adapters - self.add_invertible_adapter = self.encoder.add_invertible_adapter - self.get_invertible_adapter = self.encoder.get_invertible_adapter - self.enable_invertible_adapters = self.encoder.enable_invertible_adapters - self.invertible_adapters_forward = self.encoder.invertible_adapters_forward - self.delete_invertible_adapter = self.encoder.delete_invertible_adapter - super()._init_adapter_modules() - - -class T5ModelHeadsMixin(ModelWithFlexibleHeadsAdaptersMixin): - """Adds flexible heads to a T5 model.""" + return model_output + + # Copied from T5ForConditionalGeneration + def prepare_inputs_for_generation( + self, + input_ids, + past=None, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + use_cache=None, + encoder_outputs=None, + **kwargs + ): + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return { + "decoder_input_ids": input_ids, + "past_key_values": past, + "encoder_outputs": encoder_outputs, + "attention_mask": attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "cross_attn_head_mask": cross_attn_head_mask, + "use_cache": use_cache, + } + + # Copied from T5ForConditionalGeneration + def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): + return self._shift_right(labels) + + # Copied from T5ForConditionalGeneration + def _reorder_cache(self, past, beam_idx): + # if decoder past is not included in output + # speedy decoding is disabled and no need to reorder + if past is None: + logger.warning("You might want to consider setting `use_cache=True` to speed up decoding") + return past + + reordered_decoder_past = () + for layer_past_states in past: + # get the correct batch idx from layer past batch dim + # batch dim of `past` is at 2nd position + reordered_layer_past_states = () + for layer_past_state in layer_past_states: + # need to set correct `past` for each of the four key / value states + reordered_layer_past_states = reordered_layer_past_states + ( + layer_past_state.index_select(0, beam_idx.to(layer_past_state.device)), + ) + + assert reordered_layer_past_states[0].shape == layer_past_states[0].shape + assert len(reordered_layer_past_states) == len(layer_past_states) + + 
reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,) + return reordered_decoder_past head_types = { "seq2seq_lm": Seq2SeqLMHead, @@ -66,3 +172,37 @@ def add_seq2seq_lm_head(self, head_name, overwrite_ok=False): """ head = Seq2SeqLMHead(self, head_name) self.add_prediction_head(head, overwrite_ok=overwrite_ok) + + +class T5ModelWithHeads(T5AdapterModel): + def __init__(self, *args, **kwargs): + warnings.warn( + "This class has been renamed to `{}` in v3. " + "Please use the new class instead as this class might be removed in a future version.".format( + self.__class__.__bases__[0].__name__ + ), + FutureWarning, + ) + super().__init__(*args, **kwargs) + + @classmethod + def from_config(cls, config): + warnings.warn( + "This class has been renamed to `{}` in v3. " + "Please use the new class instead as this class might be removed in a future version.".format( + cls.__bases__[0].__name__ + ), + FutureWarning, + ) + return super().from_config(config) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + warnings.warn( + "This class has been renamed to `{}` in v3. " + "Please use the new class instead as this class might be removed in a future version.".format( + cls.__bases__[0].__name__ + ), + FutureWarning, + ) + return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) diff --git a/src/transformers/adapters/models/xlm_roberta.py b/src/transformers/adapters/models/xlm_roberta.py new file mode 100644 index 000000000..62424f463 --- /dev/null +++ b/src/transformers/adapters/models/xlm_roberta.py @@ -0,0 +1,29 @@ +from ...file_utils import add_start_docstrings +from ...models.xlm_roberta.modeling_xlm_roberta import XLM_ROBERTA_START_DOCSTRING, XLMRobertaConfig +from .roberta import RobertaAdapterModel, RobertaModelWithHeads + + +@add_start_docstrings( + """XLM-RoBERTa Model with the option to add multiple flexible heads on top.""", + XLM_ROBERTA_START_DOCSTRING, +) +class XLMRobertaAdapterModel(RobertaAdapterModel): + """ + This class overrides :class:`~transformers.RobertaAdapterModel`. Please check the superclass for the appropriate + documentation alongside usage examples. + """ + + config_class = XLMRobertaConfig + + +@add_start_docstrings( + """XLM-RoBERTa Model with the option to add multiple flexible heads on top.""", + XLM_ROBERTA_START_DOCSTRING, +) +class XLMRobertaModelWithHeads(RobertaModelWithHeads): + """ + This class overrides :class:`~transformers.RobertaModelWithHeads`. Please check the superclass for the appropriate + documentation alongside usage examples. 
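
The `*ModelWithHeads` shims above only forward to the renamed classes and emit a deprecation warning. A short sketch of what that means for downstream code; the `transformers.adapters` import path and the `t5-small` checkpoint are assumptions for illustration:

```python
from transformers.adapters import T5AdapterModel, T5ModelWithHeads

# Preferred spelling after this patch: the renamed class.
model = T5AdapterModel.from_pretrained("t5-small")
model.add_seq2seq_lm_head("summarization")

# The old name still resolves, but from_pretrained() now raises a FutureWarning
# pointing at T5AdapterModel, as implemented in the shim above.
legacy = T5ModelWithHeads.from_pretrained("t5-small")
```
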
+ """ + + config_class = XLMRobertaConfig diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 26cd92e35..0af13aa35 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -2292,7 +2292,7 @@ def prune_layer( def apply_chunking_to_forward( - forward_fn: Callable[..., torch.Tensor], chunk_size: int, chunk_dim: int, *input_tensors, **kwargs + forward_fn: Callable[..., torch.Tensor], chunk_size: int, chunk_dim: int, *input_tensors ) -> torch.Tensor: """ This function chunks the :obj:`input_tensors` into smaller input tensor parts of size :obj:`chunk_size` over the @@ -2330,11 +2330,7 @@ def forward(self, hidden_states): assert len(input_tensors) > 0, f"{input_tensors} has to be a tuple/list of tensors" # inspect.signature exist since python 3.5 and is a python method -> no problem with backward compatibility - forward_fn_params = inspect.signature(forward_fn).parameters - num_args_in_forward_chunk_fn = len(forward_fn_params) - # subtract one for kwargs - if "kwargs" in forward_fn_params: - num_args_in_forward_chunk_fn -= 1 + num_args_in_forward_chunk_fn = len(inspect.signature(forward_fn).parameters) if num_args_in_forward_chunk_fn != len(input_tensors): raise ValueError( f"forward_chunk_fn expects {num_args_in_forward_chunk_fn} arguments, but only {len(input_tensors)} input " @@ -2365,4 +2361,4 @@ def forward(self, hidden_states): # concatenate output at same dimension return torch.cat(output_chunks, dim=chunk_dim) - return forward_fn(*input_tensors, **kwargs) + return forward_fn(*input_tensors) diff --git a/src/transformers/models/auto/__init__.py b/src/transformers/models/auto/__init__.py index 34f6447bb..98133afee 100644 --- a/src/transformers/models/auto/__init__.py +++ b/src/transformers/models/auto/__init__.py @@ -47,7 +47,6 @@ "MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING", "MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING", "MODEL_MAPPING", - "MODEL_WITH_HEADS_MAPPING", "MODEL_WITH_LM_HEAD_MAPPING", "AutoModel", "AutoModelForAudioClassification", @@ -66,7 +65,6 @@ "AutoModelForSpeechSeq2Seq", "AutoModelForTableQuestionAnswering", "AutoModelForTokenClassification", - "AutoModelWithHeads", "AutoModelWithLMHead", ] @@ -147,7 +145,6 @@ MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, MODEL_MAPPING, - MODEL_WITH_HEADS_MAPPING, MODEL_WITH_LM_HEAD_MAPPING, AutoModel, AutoModelForAudioClassification, @@ -166,7 +163,6 @@ AutoModelForSpeechSeq2Seq, AutoModelForTableQuestionAnswering, AutoModelForTokenClassification, - AutoModelWithHeads, AutoModelWithLMHead, ) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 12770253b..923d1fe59 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -100,19 +100,6 @@ ] ) -MODEL_WITH_HEADS_MAPPING_NAMES = OrderedDict( - [ - ("xlm-roberta", "XLMRobertaModelWithHeads"), - ("roberta", "RobertaModelWithHeads"), - ("bert", "BertModelWithHeads"), - ("distilbert", "DistilBertModelWithHeads"), - ("bart", "BartModelWithHeads"), - ("mbart", "MBartModelWithHeads"), - ("gpt2", "GPT2ModelWithHeads"), - ("t5", "T5ModelWithHeads"), - ] -) - MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict( [ # Model for pre-training mapping @@ -515,7 +502,6 @@ ) MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES) -MODEL_WITH_HEADS_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_WITH_HEADS_MAPPING_NAMES) MODEL_FOR_PRETRAINING_MAPPING = 
_LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_PRETRAINING_MAPPING_NAMES) MODEL_WITH_LM_HEAD_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_WITH_LM_HEAD_MAPPING_NAMES) MODEL_FOR_CAUSAL_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_CAUSAL_LM_MAPPING_NAMES) @@ -567,13 +553,6 @@ class AutoModelForPreTraining(_BaseAutoModelClass): AutoModelForPreTraining = auto_class_update(AutoModelForPreTraining, head_doc="pretraining") -class AutoModelWithHeads(_BaseAutoModelClass): - _model_mapping = MODEL_WITH_HEADS_MAPPING - - -AutoModelWithHeads = auto_class_update(AutoModelWithHeads, head_doc="flexible heads") - - # Private on purpose, the public class will add the deprecation warnings. class _AutoModelWithLMHead(_BaseAutoModelClass): _model_mapping = MODEL_WITH_LM_HEAD_MAPPING diff --git a/src/transformers/models/bart/__init__.py b/src/transformers/models/bart/__init__.py index ba62b452a..a8ddcecc4 100644 --- a/src/transformers/models/bart/__init__.py +++ b/src/transformers/models/bart/__init__.py @@ -36,7 +36,6 @@ "BartForQuestionAnswering", "BartForSequenceClassification", "BartModel", - "BartModelWithHeads", "BartPretrainedModel", "PretrainedBartModel", ] @@ -68,7 +67,6 @@ BartForQuestionAnswering, BartForSequenceClassification, BartModel, - BartModelWithHeads, BartPretrainedModel, PretrainedBartModel, ) diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py index 818db261a..31dc4a455 100755 --- a/src/transformers/models/bart/modeling_bart.py +++ b/src/transformers/models/bart/modeling_bart.py @@ -27,15 +27,13 @@ from ...activations import ACT2FN from ...adapters.composition import adjust_tensors_for_parallel from ...adapters.context import ForwardContext -from ...adapters.model_mixin import InvertibleAdaptersMixin, ModelWithHeadsAdaptersMixin -from ...adapters.models.bart import ( +from ...adapters.mixins.bart import ( BartDecoderLayerAdaptersMixin, BartEncoderLayerAdaptersMixin, BartModelAdaptersMixin, - BartModelHeadsMixin, ) +from ...adapters.model_mixin import InvertibleAdaptersMixin, ModelWithHeadsAdaptersMixin from ...file_utils import ( - ModelOutput, add_code_sample_docstrings, add_end_docstrings, add_start_docstrings, @@ -1251,143 +1249,6 @@ def forward( ) -@add_start_docstrings( - "BART Model with the option to add multiple flexible prediction heads on top.", BART_START_DOCSTRING -) -class BartModelWithHeads(BartModelHeadsMixin, BartPretrainedModel): - def __init__(self, config: BartConfig, **kwargs): - super().__init__(config, **kwargs) - self.model = BartModel(config) - - self._init_head_modules() - - def get_encoder(self): - return self.model.get_encoder() - - def get_decoder(self): - return self.model.get_decoder() - - @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint="facebook/bart-large", - output_type=ModelOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - decoder_input_ids=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - encoder_outputs=None, - inputs_embeds=None, - decoder_inputs_embeds=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - head=None, - past_key_values=None, - **kwargs - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for computing the sequence classification/regression loss. 
Indices should be in :obj:`[0, ..., - config.num_labels - 1]`. If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if "labels" in kwargs or "start_positions" in kwargs and "end_positions" in kwargs: - use_cache = False - - outputs = self.model( - input_ids, - attention_mask=attention_mask, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - head_mask=head_mask, - decoder_head_mask=decoder_head_mask, - cross_attn_head_mask=cross_attn_head_mask, - encoder_outputs=encoder_outputs, - inputs_embeds=inputs_embeds, - decoder_inputs_embeds=decoder_inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - past_key_values=past_key_values, - ) - # sequence classification based on last token in sequence - x = outputs[0] # last hidden state - if input_ids is not None and x.shape[1] == input_ids.shape[1]: - eos_mask = input_ids.eq(self.config.eos_token_id) - (eos_mask,) = adjust_tensors_for_parallel(x, eos_mask) - if len(torch.unique(eos_mask.sum(1))) > 1: - raise ValueError("All examples must have the same number of tokens.") - cls_representation = x[eos_mask, :].view(x.size(0), -1, x.size(-1))[:, -1, :] - else: - cls_representation = x - - head_outputs = self.forward_head( - outputs, - head_name=head, - cls_output=cls_representation, - attention_mask=attention_mask, - return_dict=return_dict, - **kwargs, - ) - - return head_outputs - - # Copied from BartForConditionalGeneration - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs - ): - # cut decoder_input_ids if past is used - if past is not None: - decoder_input_ids = decoder_input_ids[:, -1:] - - return { - "input_ids": None, # encoder_outputs is defined. input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, # change this to avoid caching (presumably for debugging) - } - - # Copied from BartForConditionalGeneration - def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): - return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) - - # Copied from BartForConditionalGeneration - @staticmethod - def _reorder_cache(past, beam_idx): - reordered_past = () - for layer_past in past: - # cached cross_attention states don't have to be reordered -> they are always the same - reordered_past += ( - tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:], - ) - return reordered_past - - @add_start_docstrings( "The BART Model with a language modeling head. 
Can be used for summarization.", BART_START_DOCSTRING ) diff --git a/src/transformers/models/bert/__init__.py b/src/transformers/models/bert/__init__.py index 1c539fb08..9bcf37228 100644 --- a/src/transformers/models/bert/__init__.py +++ b/src/transformers/models/bert/__init__.py @@ -42,7 +42,6 @@ "BertLayer", "BertLMHeadModel", "BertModel", - "BertModelWithHeads", "BertPreTrainedModel", "load_tf_weights_in_bert", ] @@ -97,7 +96,6 @@ BertLayer, BertLMHeadModel, BertModel, - BertModelWithHeads, BertPreTrainedModel, load_tf_weights_in_bert, ) diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index 704b0141a..c53e4e1dc 100644 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -30,14 +30,9 @@ from ...activations import ACT2FN from ...adapters.composition import adjust_tensors_for_parallel -from ...adapters.context import AdapterSetup, ForwardContext +from ...adapters.context import ForwardContext +from ...adapters.mixins.bert import BertModelAdaptersMixin, BertOutputAdaptersMixin, BertSelfOutputAdaptersMixin from ...adapters.model_mixin import ModelWithHeadsAdaptersMixin -from ...adapters.models.bert import ( - BertModelAdaptersMixin, - BertModelHeadsMixin, - BertOutputAdaptersMixin, - BertSelfOutputAdaptersMixin, -) from ...file_utils import ( ModelOutput, add_code_sample_docstrings, @@ -1045,80 +1040,6 @@ def forward( ) -@add_start_docstrings( - """Bert Model transformer with the option to add multiple flexible heads on top.""", - BERT_START_DOCSTRING, -) -class BertModelWithHeads(BertModelHeadsMixin, BertPreTrainedModel): - def __init__(self, config): - super().__init__(config) - - self.bert = BertModel(config) - - self._init_head_modules() - - self.init_weights() - - @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - head=None, - **kwargs - ): - input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None - attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None - token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None - position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None - inputs_embeds = ( - inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) - if inputs_embeds is not None - else None - ) - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - # BERT & RoBERTa return the pooled output as second item, we don't need that in these heads - if not return_dict: - head_inputs = (outputs[0],) + outputs[2:] - else: - head_inputs = outputs - pooled_output = outputs[1] - - if head or AdapterSetup.get_context_head_setup() or self.active_head: - head_outputs = self.forward_head( - head_inputs, - head_name=head, - attention_mask=attention_mask, - return_dict=return_dict, - pooled_output=pooled_output, - 
**kwargs, - ) - return head_outputs - else: - # in case no head is used just return the output of the base model (including pooler output) - return outputs - - @add_start_docstrings( """ Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next diff --git a/src/transformers/models/distilbert/__init__.py b/src/transformers/models/distilbert/__init__.py index 1a7fdac64..3be3dda0f 100644 --- a/src/transformers/models/distilbert/__init__.py +++ b/src/transformers/models/distilbert/__init__.py @@ -42,7 +42,6 @@ "DistilBertForSequenceClassification", "DistilBertForTokenClassification", "DistilBertModel", - "DistilBertModelWithHeads", "DistilBertPreTrainedModel", ] @@ -91,7 +90,6 @@ DistilBertForSequenceClassification, DistilBertForTokenClassification, DistilBertModel, - DistilBertModelWithHeads, DistilBertPreTrainedModel, ) diff --git a/src/transformers/models/distilbert/modeling_distilbert.py b/src/transformers/models/distilbert/modeling_distilbert.py index d686b61cc..56e5bcb71 100755 --- a/src/transformers/models/distilbert/modeling_distilbert.py +++ b/src/transformers/models/distilbert/modeling_distilbert.py @@ -29,15 +29,10 @@ from ...activations import gelu from ...adapters.composition import adjust_tensors_for_parallel from ...adapters.context import ForwardContext +from ...adapters.mixins.distilbert import DistilBertModelAdaptersMixin, DistilBertTransfomerBlockAdaptersMixin from ...adapters.model_mixin import ModelWithHeadsAdaptersMixin -from ...adapters.models.distilbert import ( - DistilBertModelAdaptersMixin, - DistilBertModelHeadsMixin, - DistilBertTransfomerBlockAdaptersMixin, -) from ...deepspeed import is_deepspeed_zero3_enabled from ...file_utils import ( - ModelOutput, add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, @@ -576,86 +571,6 @@ def forward( ) -@add_start_docstrings( - """DistilBert Model transformer with the option to add multiple flexible heads on top.""", - DISTILBERT_START_DOCSTRING, -) -class DistilBertModelWithHeads(DistilBertModelHeadsMixin, DistilBertPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.distilbert = DistilBertModel(config) - - self._init_head_modules() - - self.init_weights() - - def get_position_embeddings(self) -> nn.Embedding: - """ - Returns the position embeddings - """ - return self.distilbert.get_position_embeddings() - - def resize_position_embeddings(self, new_num_position_embeddings: int): - """ - Resizes position embeddings of the model if :obj:`new_num_position_embeddings != - config.max_position_embeddings`. - - Arguments: - new_num_position_embeddings (:obj:`int`): - The number of new position embedding matrix. If position embeddings are learned, increasing the size - will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the - end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the - size will add correct vectors at the end following the position encoding algorithm, whereas reducing - the size will remove vectors from the end. 
- """ - self.distilbert.resize_position_embeddings(new_num_position_embeddings) - - @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices")) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint="distilbert-base-uncased", - output_type=ModelOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - head=None, - **kwargs - ): - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None - attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None - inputs_embeds = ( - inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) - if inputs_embeds is not None - else None - ) - - distilbert_output = self.distilbert( - input_ids=input_ids, - attention_mask=attention_mask, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - outputs = self.forward_head( - distilbert_output, head_name=head, attention_mask=attention_mask, return_dict=return_dict, **kwargs - ) - - return outputs - - @add_start_docstrings( """DistilBert Model with a `masked language modeling` head on top. """, DISTILBERT_START_DOCSTRING, diff --git a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py index 8eeeb9cbd..88081c65e 100644 --- a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py @@ -21,7 +21,7 @@ from torch.nn import CrossEntropyLoss from ...adapters.context import ForwardContext -from ...adapters.models.encoder_decoder import EncoderDecoderModelAdaptersMixin +from ...adapters.mixins.encoder_decoder import EncoderDecoderModelAdaptersMixin from ...configuration_utils import PretrainedConfig from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings from ...modeling_outputs import Seq2SeqLMOutput diff --git a/src/transformers/models/gpt2/__init__.py b/src/transformers/models/gpt2/__init__.py index 12ab7099e..7169ddc63 100644 --- a/src/transformers/models/gpt2/__init__.py +++ b/src/transformers/models/gpt2/__init__.py @@ -37,7 +37,6 @@ "GPT2ForTokenClassification", "GPT2LMHeadModel", "GPT2Model", - "GPT2ModelWithHeads", "GPT2PreTrainedModel", "load_tf_weights_in_gpt2", ] @@ -71,7 +70,6 @@ GPT2ForTokenClassification, GPT2LMHeadModel, GPT2Model, - GPT2ModelWithHeads, GPT2PreTrainedModel, load_tf_weights_in_gpt2, ) diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index 0f6fe2c02..3a89e62e9 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -36,8 +36,8 @@ from ...activations import ACT2FN from ...adapters.composition import adjust_tensors_for_parallel from ...adapters.context import ForwardContext +from ...adapters.mixins.gpt2 import GPT2DecoderBlockAdaptersMixin, GPT2ModelAdapterMixin from ...adapters.model_mixin import ModelWithHeadsAdaptersMixin -from ...adapters.models.gpt2 import GPT2DecoderBlockAdaptersMixin, GPT2ModelAdapterMixin, GPT2ModelHeadsMixin 
from ...file_utils import ( ModelOutput, add_code_sample_docstrings, @@ -1542,85 +1542,3 @@ def forward( hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, ) - - -@add_start_docstrings( - """ -The GPT2 Model that allows the loading of different heads dor different tasks. This enables a flexible use of the -models and adpters. Since this class does classification on the last token, it requires to know the position of the -last token. If a :obj:`pad_token_id` is defined in the configuration, it finds the last token that is not a padding -token in each row. If no :obj:`pad_token_id` is defined, it simply takes the last value in each row of the batch. Since -it cannot guess the padding tokens when :obj:`inputs_embeds` are passed instead of :obj:`input_ids`, it does the same -(take the last value in each row of the batch). -""", - GPT2_START_DOCSTRING, -) -class GPT2ModelWithHeads(GPT2ModelHeadsMixin, GPT2PreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.transformer = GPT2Model(config) - - self._init_head_modules() - - self.init_weights() - - # Model parallel - self.model_parallel = False - self.device_map = None - - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - head=None, - **kwargs - ): - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.transformer( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - batch_size = outputs[0].shape[0] - - if self.config.pad_token_id is None: - # TODO-AH: this may result in unexpected behavior for classification. Find a better way to do this? - sequence_lengths = -1 - else: - if input_ids is not None: - sequence_lengths = torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1 - (sequence_lengths,) = adjust_tensors_for_parallel(outputs[0], sequence_lengths) - else: - sequence_lengths = -1 - logger.warning( - f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. 
Results may be " - f"unexpected if using padding tokens in conjunction with `inputs_embeds.`" - ) - - cls_logits = outputs[0][range(batch_size), sequence_lengths] - - outputs = self.forward_head( - outputs, - head_name=head, - cls_output=cls_logits, - attention_mask=attention_mask, - return_dict=return_dict, - **kwargs, - ) - - return outputs diff --git a/src/transformers/models/mbart/__init__.py b/src/transformers/models/mbart/__init__.py index 2363c5584..613c90afb 100644 --- a/src/transformers/models/mbart/__init__.py +++ b/src/transformers/models/mbart/__init__.py @@ -45,7 +45,6 @@ "MBartForQuestionAnswering", "MBartForSequenceClassification", "MBartModel", - "MBartModelWithHeads", "MBartPreTrainedModel", ] @@ -83,7 +82,6 @@ MBartForQuestionAnswering, MBartForSequenceClassification, MBartModel, - MBartModelWithHeads, MBartPreTrainedModel, ) diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py index 4cf04b6ca..0a998b686 100755 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -26,15 +26,13 @@ from ...activations import ACT2FN from ...adapters.composition import adjust_tensors_for_parallel from ...adapters.context import ForwardContext -from ...adapters.model_mixin import InvertibleAdaptersMixin, ModelWithHeadsAdaptersMixin -from ...adapters.models.bart import ( +from ...adapters.mixins.bart import ( BartDecoderLayerAdaptersMixin, BartEncoderLayerAdaptersMixin, BartModelAdaptersMixin, - BartModelHeadsMixin, ) +from ...adapters.model_mixin import InvertibleAdaptersMixin, ModelWithHeadsAdaptersMixin from ...file_utils import ( - ModelOutput, add_code_sample_docstrings, add_end_docstrings, add_start_docstrings, @@ -1251,143 +1249,6 @@ def forward( ) -@add_start_docstrings( - "MBART Model with the option to add multiple flexible prediction heads on top.", MBART_START_DOCSTRING -) -class MBartModelWithHeads(BartModelHeadsMixin, MBartPreTrainedModel): - def __init__(self, config: MBartConfig, **kwargs): - super().__init__(config, **kwargs) - self.model = MBartModel(config) - - self._init_head_modules() - - def get_encoder(self): - return self.model.get_encoder() - - def get_decoder(self): - return self.model.get_decoder() - - @add_start_docstrings_to_model_forward(MBART_INPUTS_DOCSTRING) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint="facebook/mbart-large-cc25", - output_type=ModelOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - decoder_input_ids=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - encoder_outputs=None, - inputs_embeds=None, - decoder_inputs_embeds=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - head=None, - past_key_values=None, - **kwargs - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., - config.num_labels - 1]`. If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if "labels" in kwargs or "start_positions" in kwargs and "end_positions" in kwargs: - use_cache = False - - outputs = self.model( - input_ids, - attention_mask=attention_mask, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - head_mask=head_mask, - decoder_head_mask=decoder_head_mask, - cross_attn_head_mask=cross_attn_head_mask, - encoder_outputs=encoder_outputs, - inputs_embeds=inputs_embeds, - decoder_inputs_embeds=decoder_inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - past_key_values=past_key_values, - ) - # sequence classification based on last token in sequence - x = outputs[0] # last hidden state - if input_ids is not None and x.shape[1] == input_ids.shape[1]: - eos_mask = input_ids.eq(self.config.eos_token_id) - (eos_mask,) = adjust_tensors_for_parallel(x, eos_mask) - if len(torch.unique(eos_mask.sum(1))) > 1: - raise ValueError("All examples must have the same number of tokens.") - cls_representation = x[eos_mask, :].view(x.size(0), -1, x.size(-1))[:, -1, :] - else: - cls_representation = x - - head_outputs = self.forward_head( - outputs, - head_name=head, - cls_output=cls_representation, - attention_mask=attention_mask, - return_dict=return_dict, - **kwargs, - ) - - return head_outputs - - # Copied from MBartForConditionalGeneration - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs - ): - # cut decoder_input_ids if past is used - if past is not None: - decoder_input_ids = decoder_input_ids[:, -1:] - - return { - "input_ids": None, # encoder_outputs is defined. input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, # change this to avoid caching (presumably for debugging) - } - - # Copied from MBartForConditionalGeneration - def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): - return shift_tokens_right(labels, self.config.pad_token_id) - - # Copied from MBartForConditionalGeneration - @staticmethod - def _reorder_cache(past, beam_idx): - reordered_past = () - for layer_past in past: - # cached cross_attention states don't have to be reordered -> they are always the same - reordered_past += ( - tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:], - ) - return reordered_past - - @add_start_docstrings( "The MBART Model with a language modeling head. 
Can be used for summarization.", MBART_START_DOCSTRING ) diff --git a/src/transformers/models/roberta/__init__.py b/src/transformers/models/roberta/__init__.py index 75d95a097..91058cf04 100644 --- a/src/transformers/models/roberta/__init__.py +++ b/src/transformers/models/roberta/__init__.py @@ -39,7 +39,6 @@ "RobertaForSequenceClassification", "RobertaForTokenClassification", "RobertaModel", - "RobertaModelWithHeads", "RobertaPreTrainedModel", ] @@ -86,7 +85,6 @@ RobertaForSequenceClassification, RobertaForTokenClassification, RobertaModel, - RobertaModelWithHeads, RobertaPreTrainedModel, ) diff --git a/src/transformers/models/roberta/modeling_roberta.py b/src/transformers/models/roberta/modeling_roberta.py index cc543e948..897c76c54 100644 --- a/src/transformers/models/roberta/modeling_roberta.py +++ b/src/transformers/models/roberta/modeling_roberta.py @@ -25,16 +25,10 @@ from ...activations import ACT2FN, gelu from ...adapters.composition import adjust_tensors_for_parallel -from ...adapters.context import AdapterSetup, ForwardContext +from ...adapters.context import ForwardContext +from ...adapters.mixins.bert import BertModelAdaptersMixin, BertOutputAdaptersMixin, BertSelfOutputAdaptersMixin from ...adapters.model_mixin import ModelWithHeadsAdaptersMixin -from ...adapters.models.bert import ( - BertModelAdaptersMixin, - BertModelHeadsMixin, - BertOutputAdaptersMixin, - BertSelfOutputAdaptersMixin, -) from ...file_utils import ( - ModelOutput, add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, @@ -898,86 +892,6 @@ def forward( ) -@add_start_docstrings( - """Roberta Model transformer with the option to add multiple flexible heads on top.""", - ROBERTA_START_DOCSTRING, -) -class RobertaModelWithHeads(BertModelHeadsMixin, RobertaPreTrainedModel): - def __init__(self, config): - super().__init__(config) - - self.roberta = RobertaModel(config) - - self._init_head_modules() - - self.init_weights() - - @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint="roberta-base", - output_type=ModelOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - head=None, - **kwargs - ): - input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None - position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None - token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None - attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None - inputs_embeds = ( - inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) - if inputs_embeds is not None - else None - ) - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - # BERT & RoBERTa return the pooled output as second item, we don't need that in these heads - if not return_dict: - 
head_inputs = (outputs[0],) + outputs[2:] - else: - head_inputs = outputs - pooled_output = outputs[1] - - if head or AdapterSetup.get_context_head_setup() or self.active_head: - head_outputs = self.forward_head( - head_inputs, - head_name=head, - attention_mask=attention_mask, - return_dict=return_dict, - pooled_output=pooled_output, - **kwargs, - ) - return head_outputs - else: - # in case no head is used just return the output of the base model (including pooler output) - return outputs - - @add_start_docstrings( """RoBERTa Model with a `language modeling` head on top for CLM fine-tuning. """, ROBERTA_START_DOCSTRING ) diff --git a/src/transformers/models/t5/__init__.py b/src/transformers/models/t5/__init__.py index 53d594673..0b6e8f8ac 100644 --- a/src/transformers/models/t5/__init__.py +++ b/src/transformers/models/t5/__init__.py @@ -44,7 +44,6 @@ "T5EncoderModel", "T5ForConditionalGeneration", "T5Model", - "T5ModelWithHeads", "T5PreTrainedModel", "load_tf_weights_in_t5", ] @@ -81,7 +80,6 @@ T5EncoderModel, T5ForConditionalGeneration, T5Model, - T5ModelWithHeads, T5PreTrainedModel, load_tf_weights_in_t5, ) diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py index ef3110773..5a325d8ed 100644 --- a/src/transformers/models/t5/modeling_t5.py +++ b/src/transformers/models/t5/modeling_t5.py @@ -28,14 +28,13 @@ from ...activations import ACT2FN from ...adapters.composition import adjust_tensors_for_parallel from ...adapters.context import ForwardContext -from ...adapters.model_mixin import InvertibleAdaptersMixin, ModelWithHeadsAdaptersMixin -from ...adapters.models.t5 import ( +from ...adapters.mixins.t5 import ( T5CrossAttentionLayerAdaptersMixin, T5FFLayerAdaptersMixin, T5ModelAdaptersMixin, - T5ModelHeadsMixin, T5SelfAttentionLayerAdaptersMixin, ) +from ...adapters.model_mixin import InvertibleAdaptersMixin, ModelWithHeadsAdaptersMixin from ...file_utils import ( DUMMY_INPUTS, DUMMY_MASK, @@ -1862,149 +1861,3 @@ def forward( ) return encoder_outputs - - -class T5ModelWithHeads(T5ModelHeadsMixin, T5PreTrainedModel): - def __init__(self, config): - super().__init__(config) - - self.transformer = T5Model(config) - - self._init_head_modules() - self._init_adapter_modules() - self.init_weights() - - # Model parallel - self.model_parallel = False - self.device_map = None - - def get_encoder(self): - return self.transformer.encoder - - def get_decoder(self): - return self.transformer.decoder - - def forward( - self, - input_ids=None, - attention_mask=None, - decoder_input_ids=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - encoder_outputs=None, - past_key_values=None, - inputs_embeds=None, - decoder_inputs_embeds=None, - labels=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - head=None, - **kwargs - ): - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: - # get decoder inputs from shifting lm labels to the right - decoder_input_ids = self._shift_right(labels) - - model_output = self.transformer( - input_ids=input_ids, - attention_mask=attention_mask, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - head_mask=head_mask, - decoder_head_mask=decoder_head_mask, - cross_attn_head_mask=cross_attn_head_mask, - encoder_outputs=encoder_outputs, - past_key_values=past_key_values, - 
inputs_embeds=inputs_embeds, - decoder_inputs_embeds=decoder_inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - sequence_output = model_output[0] - # ToDo move head to device for parallel forward pass - - if self.config.tie_word_embeddings: - # Rescale output before projecting on vocab - # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 - new_hidden_state = sequence_output * (self.config.d_model ** -0.5) - if isinstance(model_output, tuple): - model_output = (new_hidden_state,) + model_output[1:] - else: - model_output["last_hidden_state"] = new_hidden_state - - if head or self.active_head: - kwargs["labels"] = labels - head_outputs = self.forward_head( - model_output, - head_name=head, - return_dict=return_dict, - **kwargs, - ) - return head_outputs - else: - return model_output - - # Copied from T5ForConditionalGeneration - def prepare_inputs_for_generation( - self, - input_ids, - past=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs - ): - - # cut decoder_input_ids if past is used - if past is not None: - input_ids = input_ids[:, -1:] - - return { - "decoder_input_ids": input_ids, - "past_key_values": past, - "encoder_outputs": encoder_outputs, - "attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, - } - - # Copied from T5ForConditionalGeneration - def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): - return self._shift_right(labels) - - # Copied from T5ForConditionalGeneration - def _reorder_cache(self, past, beam_idx): - # if decoder past is not included in output - # speedy decoding is disabled and no need to reorder - if past is None: - logger.warning("You might want to consider setting `use_cache=True` to speed up decoding") - return past - - reordered_decoder_past = () - for layer_past_states in past: - # get the correct batch idx from layer past batch dim - # batch dim of `past` is at 2nd position - reordered_layer_past_states = () - for layer_past_state in layer_past_states: - # need to set correct `past` for each of the four key / value states - reordered_layer_past_states = reordered_layer_past_states + ( - layer_past_state.index_select(0, beam_idx.to(layer_past_state.device)), - ) - - assert reordered_layer_past_states[0].shape == layer_past_states[0].shape - assert len(reordered_layer_past_states) == len(layer_past_states) - - reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,) - return reordered_decoder_past diff --git a/src/transformers/models/xlm_roberta/__init__.py b/src/transformers/models/xlm_roberta/__init__.py index e2d30f699..7ef5dd2c0 100644 --- a/src/transformers/models/xlm_roberta/__init__.py +++ b/src/transformers/models/xlm_roberta/__init__.py @@ -51,7 +51,6 @@ "XLMRobertaForSequenceClassification", "XLMRobertaForTokenClassification", "XLMRobertaModel", - "XLMRobertaModelWithHeads", ] if is_tf_available(): @@ -89,7 +88,6 @@ XLMRobertaForSequenceClassification, XLMRobertaForTokenClassification, XLMRobertaModel, - XLMRobertaModelWithHeads, ) if is_tf_available(): diff --git a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py index 
90f879f76..edcf15187 100644 --- a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py @@ -25,7 +25,6 @@ RobertaForSequenceClassification, RobertaForTokenClassification, RobertaModel, - RobertaModelWithHeads, ) from .configuration_xlm_roberta import XLMRobertaConfig @@ -74,19 +73,6 @@ class XLMRobertaModel(RobertaModel): config_class = XLMRobertaConfig -@add_start_docstrings( - """XLM-RoBERTa Model with the option to add multiple flexible heads on top.""", - XLM_ROBERTA_START_DOCSTRING, -) -class XLMRobertaModelWithHeads(RobertaModelWithHeads): - """ - This class overrides :class:`~transformers.RobertaModelWithHeads`. Please check the superclass for the appropriate - documentation alongside usage examples. - """ - - config_class = XLMRobertaConfig - - @add_start_docstrings( "XLM-RoBERTa Model with a `language modeling` head on top for CLM fine-tuning.", XLM_ROBERTA_START_DOCSTRING, diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py index ddbec9ad8..7588eabad 100644 --- a/src/transformers/pipelines/base.py +++ b/src/transformers/pipelines/base.py @@ -27,11 +27,11 @@ from packaging import version +from ..adapters.models.auto import ADAPTER_MODEL_MAPPING from ..feature_extraction_utils import PreTrainedFeatureExtractor from ..file_utils import ModelOutput, add_end_docstrings, is_tf_available, is_torch_available from ..modelcard import ModelCard from ..models.auto.configuration_auto import AutoConfig -from ..models.auto.modeling_auto import MODEL_WITH_HEADS_MAPPING from ..tokenization_utils import PreTrainedTokenizer from ..utils import logging @@ -815,7 +815,7 @@ def check_model_type(self, supported_models: Union[List[str], dict]): else: supported_models_names.append(model.__name__) supported_models = supported_models_names - for item in MODEL_WITH_HEADS_MAPPING.values(): + for item in ADAPTER_MODEL_MAPPING.values(): supported_models.append(item.__name__) if self.model.__class__.__name__ not in supported_models: logger.error( diff --git a/tests/test_adapter.py b/tests/test_adapter.py deleted file mode 100644 index e09fefc41..000000000 --- a/tests/test_adapter.py +++ /dev/null @@ -1,442 +0,0 @@ -import random -import unittest - -import torch -from datasets import load_dataset - -from tests.test_adapter_embeddings import EmbeddingTestMixin -from transformers import ( - AutoModel, - AutoModelForSeq2SeqLM, - BartConfig, - BertConfig, - DistilBertConfig, - EncoderDecoderConfig, - EncoderDecoderModel, - GlueDataset, - GlueDataTrainingArguments, - GPT2Config, - MBartConfig, - RobertaConfig, - T5Config, - XLMRobertaConfig, -) -from transformers.testing_utils import require_torch, torch_device - -from .test_adapter_common import AdapterModelTestMixin -from .test_adapter_composition import ParallelAdapterInferenceTestMixin, ParallelTrainingMixin -from .test_adapter_conversion import ModelClassConversionTestMixin -from .test_adapter_fusion_common import AdapterFusionModelTestMixin -from .test_adapter_heads import PredictionHeadModelTestMixin -from .test_adapter_training import AdapterTrainingTestMixin - - -def make_config(config_class, **kwargs): - return staticmethod(lambda: config_class(**kwargs)) - - -class AdapterTestBase: - # If not overriden by subclass, AutoModel should be used. 
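
The `make_config` helper in the deleted test file above freezes constructor kwargs into a zero-argument factory that each test base class exposes as `config`. A standalone sketch of the same pattern, reusing the `BertConfig` values from that file and nothing beyond it:

```python
from transformers import BertConfig


def make_config(config_class, **kwargs):
    # Returns a staticmethod wrapping a no-argument factory, so test classes can
    # call cls.config() to get a fresh, small config for every test.
    return staticmethod(lambda: config_class(**kwargs))


class BertAdapterTestBase:
    config = make_config(
        BertConfig,
        hidden_size=32,
        num_hidden_layers=4,
        num_attention_heads=4,
        intermediate_size=37,
    )


config = BertAdapterTestBase.config()  # a new BertConfig instance per call
assert config.hidden_size == 32
```
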
- model_class = AutoModel - - def get_model(self): - if self.model_class == AutoModel: - model = AutoModel.from_config(self.config()) - else: - model = self.model_class(self.config()) - model.to(torch_device) - return model - - def get_input_samples(self, shape, vocab_size=5000, config=None): - total_dims = 1 - for dim in shape: - total_dims *= dim - - values = [] - for _ in range(total_dims): - values.append(random.randint(0, vocab_size - 1)) - input_ids = torch.tensor(data=values, dtype=torch.long, device=torch_device).view(shape).contiguous() - # this is needed e.g. for BART - if config and config.eos_token_id is not None and config.eos_token_id < vocab_size: - input_ids[input_ids == config.eos_token_id] = random.randint(0, config.eos_token_id - 1) - input_ids[:, -1] = config.eos_token_id - in_data = {"input_ids": input_ids} - - if config and config.is_encoder_decoder: - in_data["decoder_input_ids"] = input_ids.clone() - return in_data - - def add_head(self, model, name, **kwargs): - model.add_classification_head(name, **kwargs) - - def dataset(self, tokenizer): - data_args = GlueDataTrainingArguments( - task_name="mrpc", data_dir="./tests/fixtures/tests_samples/MRPC", overwrite_cache=True - ) - return GlueDataset(data_args, tokenizer=tokenizer, mode="train") - - -class BertAdapterTestBase(AdapterTestBase): - config_class = BertConfig - config = make_config( - BertConfig, - hidden_size=32, - num_hidden_layers=4, - num_attention_heads=4, - intermediate_size=37, - ) - tokenizer_name = "bert-base-uncased" - - -@require_torch -class BertAdapterTest( - EmbeddingTestMixin, - AdapterModelTestMixin, - AdapterFusionModelTestMixin, - PredictionHeadModelTestMixin, - AdapterTrainingTestMixin, - ParallelAdapterInferenceTestMixin, - ParallelTrainingMixin, - BertAdapterTestBase, - unittest.TestCase, -): - pass - - -@require_torch -class BertClassConversionTest( - ModelClassConversionTestMixin, - BertAdapterTestBase, - unittest.TestCase, -): - pass - - -class RobertaAdapterTestBase(AdapterTestBase): - config_class = RobertaConfig - config = make_config( - RobertaConfig, - hidden_size=32, - num_hidden_layers=4, - num_attention_heads=4, - intermediate_size=37, - ) - - -@require_torch -class RobertaAdapterTest( - AdapterModelTestMixin, - AdapterFusionModelTestMixin, - PredictionHeadModelTestMixin, - ParallelAdapterInferenceTestMixin, - RobertaAdapterTestBase, - unittest.TestCase, -): - pass - - -@require_torch -class RobertaClassConversionTest( - ModelClassConversionTestMixin, - RobertaAdapterTestBase, - unittest.TestCase, -): - pass - - -@require_torch -class XLMRobertaClassConversionTest( - ModelClassConversionTestMixin, - AdapterTestBase, - unittest.TestCase, -): - config_class = XLMRobertaConfig - config = make_config( - XLMRobertaConfig, - hidden_size=32, - num_hidden_layers=4, - num_attention_heads=4, - intermediate_size=37, - ) - - -class DistilBertAdapterTestBase(AdapterTestBase): - config_class = DistilBertConfig - config = make_config( - DistilBertConfig, - dim=32, - n_layers=4, - n_heads=4, - hidden_dim=37, - ) - tokenizer_name = "distilbert-base-uncased" - - -@require_torch -class DistilBertAdapterTest( - AdapterModelTestMixin, - EmbeddingTestMixin, - AdapterFusionModelTestMixin, - PredictionHeadModelTestMixin, - AdapterTrainingTestMixin, - ParallelAdapterInferenceTestMixin, - ParallelTrainingMixin, - DistilBertAdapterTestBase, - unittest.TestCase, -): - pass - - -@require_torch -class DistilBertClassConversionTest( - ModelClassConversionTestMixin, - DistilBertAdapterTestBase, - 
unittest.TestCase, -): - pass - - -class BartAdapterTestBase(AdapterTestBase): - config_class = BartConfig - config = make_config( - BartConfig, - d_model=16, - encoder_layers=2, - decoder_layers=2, - encoder_attention_heads=4, - decoder_attention_heads=4, - encoder_ffn_dim=4, - decoder_ffn_dim=4, - ) - tokenizer_name = "facebook/bart-base" - - -@require_torch -class BartAdapterTest( - AdapterModelTestMixin, - AdapterFusionModelTestMixin, - EmbeddingTestMixin, - PredictionHeadModelTestMixin, - AdapterTrainingTestMixin, - ParallelAdapterInferenceTestMixin, - ParallelTrainingMixin, - BartAdapterTestBase, - unittest.TestCase, -): - pass - - -@require_torch -class BartClassConversionTest( - ModelClassConversionTestMixin, - BartAdapterTestBase, - unittest.TestCase, -): - pass - - -class MBartAdapterTestBase(AdapterTestBase): - config_class = MBartConfig - config = make_config( - MBartConfig, - d_model=16, - encoder_layers=2, - decoder_layers=2, - encoder_attention_heads=4, - decoder_attention_heads=4, - encoder_ffn_dim=4, - decoder_ffn_dim=4, - ) - - -@require_torch -class MBartAdapterTest( - AdapterModelTestMixin, - AdapterFusionModelTestMixin, - PredictionHeadModelTestMixin, - ParallelAdapterInferenceTestMixin, - MBartAdapterTestBase, - unittest.TestCase, -): - pass - - -@require_torch -class MBartClassConversionTest( - ModelClassConversionTestMixin, - MBartAdapterTestBase, - unittest.TestCase, -): - pass - - -class GPT2AdapterTestBase(AdapterTestBase): - config_class = GPT2Config - config = make_config( - GPT2Config, - n_embd=32, - n_layer=4, - n_head=4, - # set pad token to eos token - pad_token_id=50256, - ) - tokenizer_name = "gpt2" - - -@require_torch -class GPT2AdapterTest( - AdapterModelTestMixin, - EmbeddingTestMixin, - AdapterFusionModelTestMixin, - PredictionHeadModelTestMixin, - AdapterTrainingTestMixin, - ParallelAdapterInferenceTestMixin, - ParallelTrainingMixin, - GPT2AdapterTestBase, - unittest.TestCase, -): - pass - - -@require_torch -class GPT2ClassConversionTest( - ModelClassConversionTestMixin, - GPT2AdapterTestBase, - unittest.TestCase, -): - pass - - -class EncoderDecoderAdapterTestBase(AdapterTestBase): - model_class = EncoderDecoderModel - config_class = EncoderDecoderConfig - config = staticmethod( - lambda: EncoderDecoderConfig.from_encoder_decoder_configs( - BertConfig( - hidden_size=32, - num_hidden_layers=4, - num_attention_heads=4, - intermediate_size=37, - ), - BertConfig( - hidden_size=32, - num_hidden_layers=4, - num_attention_heads=4, - intermediate_size=37, - is_decoder=True, - add_cross_attention=True, - ), - ) - ) - tokenizer_name = "bert-base-uncased" - - -@require_torch -class EncoderDecoderAdapterTest( - AdapterModelTestMixin, - AdapterFusionModelTestMixin, - EncoderDecoderAdapterTestBase, - unittest.TestCase, -): - def test_invertible_adapter_with_head(self): - """This test class is copied and adapted from the identically-named test in test_adapter_heads.py.""" - model = AutoModelForSeq2SeqLM.from_config(self.config()) - model.add_adapter("test", config="pfeiffer+inv") - model.set_active_adapters("test") - - # Set a hook before the invertible adapter to make sure it's actually called twice: - # Once after the embedding layer and once in the prediction head. 
- calls = 0 - - def forward_pre_hook(module, input): - nonlocal calls - calls += 1 - - inv_adapter = model.base_model.get_invertible_adapter() - self.assertIsNotNone(inv_adapter) - inv_adapter.register_forward_pre_hook(forward_pre_hook) - - in_data = self.get_input_samples((1, 128), config=model.config) - model.to(torch_device) - out = model(**in_data) - - self.assertEqual((1, 128, model.config.decoder.vocab_size), out[0].shape) - self.assertEqual(2, calls) - - -@require_torch -class T5AdapterTestBase(AdapterTestBase): - config_class = T5Config - config = make_config( - T5Config, - d_model=16, - num_layers=2, - num_decoder_layers=2, - num_heads=4, - d_ff=4, - d_kv=16 // 4, - tie_word_embeddings=False, - decoder_start_token_id=0, - ) - tokenizer_name = "t5-base" - - def add_head(self, model, name, **kwargs): - model.add_seq2seq_lm_head(name) - - def dataset(self, tokenizer): - def preprocess_function(examples): - inputs = examples["document"] - targets = examples["summary"] - inputs = ["Summarize: " + inp for inp in inputs] - model_inputs = tokenizer(inputs, padding=True, truncation=True) - - # Setup the tokenizer for targets - with tokenizer.as_target_tokenizer(): - labels = tokenizer(targets, padding=True, truncation=True) - - # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore - # padding in the loss. - labels["input_ids"] = [ - [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] - ] - - model_inputs["labels"] = labels["input_ids"] - return model_inputs - - data_args = { - "task_name": "xsum", - "path": "./tests/fixtures/tests_samples/xsum/sample.json", - } - dataset = load_dataset("json", data_files=data_args["path"]) - train_dataset = dataset["train"] - train_dataset = train_dataset.map( - preprocess_function, - batched=True, - desc="Running tokenizer on train dataset", - ) - return train_dataset - - -@require_torch -class T5AdapterTest( - T5AdapterTestBase, - EmbeddingTestMixin, - ParallelAdapterInferenceTestMixin, - ParallelTrainingMixin, - AdapterModelTestMixin, - AdapterFusionModelTestMixin, - AdapterTrainingTestMixin, - PredictionHeadModelTestMixin, - AdapterTestBase, - unittest.TestCase, -): - pass - - -@require_torch -class T5ClassConversionTest( - ModelClassConversionTestMixin, - T5AdapterTestBase, - unittest.TestCase, -): - pass diff --git a/tests/test_modeling_auto.py b/tests/test_modeling_auto.py index c21dd6995..ea5cbff62 100644 --- a/tests/test_modeling_auto.py +++ b/tests/test_modeling_auto.py @@ -73,7 +73,6 @@ MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, MODEL_MAPPING, - MODEL_WITH_HEADS_MAPPING, MODEL_WITH_LM_HEAD_MAPPING, ) from transformers.models.bert.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_LIST @@ -297,7 +296,6 @@ def test_parents_and_children_in_mappings(self): mappings = ( MODEL_MAPPING, - MODEL_WITH_HEADS_MAPPING, MODEL_FOR_PRETRAINING_MAPPING, MODEL_FOR_QUESTION_ANSWERING_MAPPING, MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING, diff --git a/tests/test_modeling_bart.py b/tests/test_modeling_bart.py index 27505dfa9..957350b82 100644 --- a/tests/test_modeling_bart.py +++ b/tests/test_modeling_bart.py @@ -40,7 +40,6 @@ BartForQuestionAnswering, BartForSequenceClassification, BartModel, - BartModelWithHeads, BartTokenizer, pipeline, ) @@ -408,13 +407,7 @@ def _get_embs(m): @require_torch class BartModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): all_model_classes = ( - ( - BartModel, - BartModelWithHeads, 
- BartForConditionalGeneration, - BartForSequenceClassification, - BartForQuestionAnswering, - ) + (BartModel, BartForConditionalGeneration, BartForSequenceClassification, BartForQuestionAnswering) if is_torch_available() else () ) diff --git a/tests/test_modeling_bert.py b/tests/test_modeling_bert.py index e763025aa..7b7f02a55 100644 --- a/tests/test_modeling_bert.py +++ b/tests/test_modeling_bert.py @@ -39,7 +39,6 @@ BertForTokenClassification, BertLMHeadModel, BertModel, - BertModelWithHeads, ) from transformers.models.bert.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_LIST @@ -432,7 +431,6 @@ class BertModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): all_model_classes = ( ( BertModel, - BertModelWithHeads, BertLMHeadModel, BertForMaskedLM, BertForMultipleChoice, diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index f367c7c6b..1cb39e799 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -65,11 +65,9 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, MODEL_MAPPING, - MODEL_WITH_HEADS_MAPPING, AdaptiveEmbedding, BertConfig, BertModel, - ModelWithHeadsAdaptersMixin, PretrainedConfig, PreTrainedModel, T5Config, @@ -412,7 +410,7 @@ def test_training(self): config.return_dict = True for model_class in self.all_model_classes: - if model_class in get_values(MODEL_MAPPING) or model_class in get_values(MODEL_WITH_HEADS_MAPPING): + if model_class in get_values(MODEL_MAPPING): continue model = model_class(config) model.to(torch_device) @@ -430,11 +428,7 @@ def test_training_gradient_checkpointing(self): config.return_dict = True for model_class in self.all_model_classes: - if ( - model_class in get_values(MODEL_MAPPING) - or model_class in get_values(MODEL_WITH_HEADS_MAPPING) - or not model_class.supports_gradient_checkpointing - ): + if model_class in get_values(MODEL_MAPPING) or not model_class.supports_gradient_checkpointing: continue model = model_class(config) model.to(torch_device) @@ -1331,7 +1325,7 @@ def test_correct_missing_keys(self): model = model_class(config) base_model_prefix = model.base_model_prefix - if hasattr(model, base_model_prefix) and not isinstance(model, ModelWithHeadsAdaptersMixin): + if hasattr(model, base_model_prefix): with tempfile.TemporaryDirectory() as temp_dir_name: model.base_model.save_pretrained(temp_dir_name) model, loading_info = model_class.from_pretrained(temp_dir_name, output_loading_info=True) diff --git a/tests/test_modeling_distilbert.py b/tests/test_modeling_distilbert.py index 7434f1047..ed7fba94b 100644 --- a/tests/test_modeling_distilbert.py +++ b/tests/test_modeling_distilbert.py @@ -34,7 +34,6 @@ DistilBertForSequenceClassification, DistilBertForTokenClassification, DistilBertModel, - DistilBertModelWithHeads, ) @@ -201,7 +200,6 @@ class DistilBertModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = ( ( DistilBertModel, - DistilBertModelWithHeads, DistilBertForMaskedLM, DistilBertForMultipleChoice, DistilBertForQuestionAnswering, diff --git a/tests/test_modeling_gpt2.py b/tests/test_modeling_gpt2.py index 3de5cdb80..462c6456d 100644 --- a/tests/test_modeling_gpt2.py +++ b/tests/test_modeling_gpt2.py @@ -36,7 +36,6 @@ GPT2ForTokenClassification, GPT2LMHeadModel, GPT2Model, - GPT2ModelWithHeads, GPT2Tokenizer, ) @@ -429,14 +428,7 @@ def prepare_config_and_inputs_for_common(self): class GPT2ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): all_model_classes = ( - ( - GPT2Model, - GPT2LMHeadModel, 
- GPT2DoubleHeadsModel, - GPT2ForSequenceClassification, - GPT2ForTokenClassification, - GPT2ModelWithHeads, - ) + (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel, GPT2ForSequenceClassification, GPT2ForTokenClassification) if is_torch_available() else () ) diff --git a/tests/test_modeling_mbart.py b/tests/test_modeling_mbart.py index 368cf5d5b..229eb96e9 100644 --- a/tests/test_modeling_mbart.py +++ b/tests/test_modeling_mbart.py @@ -39,7 +39,6 @@ MBartForQuestionAnswering, MBartForSequenceClassification, MBartModel, - MBartModelWithHeads, ) from transformers.models.mbart.modeling_mbart import MBartDecoder, MBartEncoder @@ -219,13 +218,7 @@ def check_encoder_decoder_model_standalone(self, config, inputs_dict): @require_torch class MBartModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): all_model_classes = ( - ( - MBartModel, - MBartModelWithHeads, - MBartForConditionalGeneration, - MBartForSequenceClassification, - MBartForQuestionAnswering, - ) + (MBartModel, MBartForConditionalGeneration, MBartForSequenceClassification, MBartForQuestionAnswering) if is_torch_available() else () ) diff --git a/tests/test_modeling_roberta.py b/tests/test_modeling_roberta.py index f451bdf00..0f700009e 100644 --- a/tests/test_modeling_roberta.py +++ b/tests/test_modeling_roberta.py @@ -36,7 +36,6 @@ RobertaForSequenceClassification, RobertaForTokenClassification, RobertaModel, - RobertaModelWithHeads, ) from transformers.models.roberta.modeling_roberta import ( ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, @@ -348,7 +347,6 @@ class RobertaModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCas RobertaForCausalLM, RobertaForMaskedLM, RobertaModel, - RobertaModelWithHeads, RobertaForSequenceClassification, RobertaForTokenClassification, RobertaForMultipleChoice, diff --git a/tests/test_modeling_t5.py b/tests/test_modeling_t5.py index 60c378b13..575850aa9 100644 --- a/tests/test_modeling_t5.py +++ b/tests/test_modeling_t5.py @@ -18,7 +18,7 @@ import tempfile import unittest -from transformers import is_torch_available +from transformers import T5Config, is_torch_available from transformers.file_utils import cached_property from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device @@ -30,15 +30,7 @@ if is_torch_available(): import torch - from transformers import ( - ByT5Tokenizer, - T5Config, - T5EncoderModel, - T5ForConditionalGeneration, - T5Model, - T5ModelWithHeads, - T5Tokenizer, - ) + from transformers import ByT5Tokenizer, T5EncoderModel, T5ForConditionalGeneration, T5Model, T5Tokenizer from transformers.models.t5.modeling_t5 import T5_PRETRAINED_MODEL_ARCHIVE_LIST @@ -515,7 +507,7 @@ def prepare_config_and_inputs_for_common(self): @require_torch class T5ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (T5Model, T5ForConditionalGeneration, T5ModelWithHeads) if is_torch_available() else () + all_model_classes = (T5Model, T5ForConditionalGeneration) if is_torch_available() else () all_generative_model_classes = (T5ForConditionalGeneration,) if is_torch_available() else () fx_ready_model_classes = all_model_classes all_parallelizable_model_classes = (T5Model, T5ForConditionalGeneration) if is_torch_available() else () diff --git a/tests_adapters/__init__.py b/tests_adapters/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests_adapters/conftest.py b/tests_adapters/conftest.py new file mode 100644 index 000000000..03c9ed131 --- /dev/null +++ 
b/tests_adapters/conftest.py @@ -0,0 +1,65 @@ +# Copyright 2022 The AdapterHub Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# tests directory-specific settings - this file is run automatically +# by pytest before any tests are run + +import sys +import warnings +from os.path import abspath, dirname, join + + +# allow having multiple repository checkouts and not needing to remember to rerun +# 'pip install -e .[dev]' when switching between checkouts and running tests. +git_repo_path = abspath(join(dirname(dirname(__file__)), "src")) +sys.path.insert(1, git_repo_path) + +# add original tests of HF to path +hf_tests_path = abspath(join(dirname(dirname(__file__)))) +sys.path.insert(1, hf_tests_path) + +# silence FutureWarning warnings in tests since often we can't act on them until +# they become normal warnings - i.e. the tests still need to test the current functionality +warnings.simplefilter(action="ignore", category=FutureWarning) + + +def pytest_configure(config): + config.addinivalue_line("markers", "is_pipeline_test: mark test to run only when pipeline are tested") + config.addinivalue_line( + "markers", "is_pt_tf_cross_test: mark test to run only when PT and TF interactions are tested" + ) + config.addinivalue_line( + "markers", "is_pt_flax_cross_test: mark test to run only when PT and FLAX interactions are tested" + ) + config.addinivalue_line("markers", "is_staging_test: mark test to run only in the staging environment") + + +def pytest_addoption(parser): + from transformers.testing_utils import pytest_addoption_shared + + pytest_addoption_shared(parser) + + +def pytest_terminal_summary(terminalreporter): + from transformers.testing_utils import pytest_terminal_summary_main + + make_reports = terminalreporter.config.getoption("--make-reports") + if make_reports: + pytest_terminal_summary_main(terminalreporter, id=make_reports) + + +def pytest_sessionfinish(session, exitstatus): + # If no tests are collected, pytest exits with code 5, which makes the CI fail.
+ if exitstatus == 5: + session.exitstatus = 0 diff --git a/tests/extended/test_adapter_trainer_ext.py b/tests_adapters/extended/test_adapter_trainer_ext.py similarity index 99% rename from tests/extended/test_adapter_trainer_ext.py rename to tests_adapters/extended/test_adapter_trainer_ext.py index 597f56fbb..4ef24281a 100644 --- a/tests/extended/test_adapter_trainer_ext.py +++ b/tests_adapters/extended/test_adapter_trainer_ext.py @@ -230,7 +230,7 @@ def run_trainer( do_eval: bool = True, do_predict: bool = True, ): - data_dir = self.test_file_dir / "../fixtures/tests_samples/wmt_en_ro" + data_dir = self.test_file_dir / "../../tests/fixtures/tests_samples/wmt_en_ro" output_dir = self.get_auto_remove_tmp_dir() args_train = f""" --model_name_or_path {model_name} diff --git a/tests/fixtures/SiBERT/config.json b/tests_adapters/fixtures/SiBERT/config.json similarity index 100% rename from tests/fixtures/SiBERT/config.json rename to tests_adapters/fixtures/SiBERT/config.json diff --git a/tests/fixtures/SiBERT/special_tokens_map.json b/tests_adapters/fixtures/SiBERT/special_tokens_map.json similarity index 100% rename from tests/fixtures/SiBERT/special_tokens_map.json rename to tests_adapters/fixtures/SiBERT/special_tokens_map.json diff --git a/tests/fixtures/SiBERT/tokenizer_config.json b/tests_adapters/fixtures/SiBERT/tokenizer_config.json similarity index 100% rename from tests/fixtures/SiBERT/tokenizer_config.json rename to tests_adapters/fixtures/SiBERT/tokenizer_config.json diff --git a/tests/fixtures/SiBERT/vocab.txt b/tests_adapters/fixtures/SiBERT/vocab.txt similarity index 100% rename from tests/fixtures/SiBERT/vocab.txt rename to tests_adapters/fixtures/SiBERT/vocab.txt diff --git a/tests/fixtures/hub-index.sample.json b/tests_adapters/fixtures/hub-index.sample.json similarity index 100% rename from tests/fixtures/hub-index.sample.json rename to tests_adapters/fixtures/hub-index.sample.json diff --git a/tests_adapters/test_adapter.py b/tests_adapters/test_adapter.py new file mode 100644 index 000000000..c381f86ba --- /dev/null +++ b/tests_adapters/test_adapter.py @@ -0,0 +1,51 @@ +import random + +import torch + +from transformers import AutoModel, GlueDataset, GlueDataTrainingArguments +from transformers.testing_utils import torch_device + + +def make_config(config_class, **kwargs): + return staticmethod(lambda: config_class(**kwargs)) + + +class AdapterTestBase: + # If not overridden by subclass, AutoModel should be used. + model_class = AutoModel + + def get_model(self): + if self.model_class == AutoModel: + model = AutoModel.from_config(self.config()) + else: + model = self.model_class(self.config()) + model.to(torch_device) + return model + + def get_input_samples(self, shape, vocab_size=5000, config=None): + total_dims = 1 + for dim in shape: + total_dims *= dim + + values = [] + for _ in range(total_dims): + values.append(random.randint(0, vocab_size - 1)) + input_ids = torch.tensor(data=values, dtype=torch.long, device=torch_device).view(shape).contiguous() + # this is needed e.g.
for BART + if config and config.eos_token_id is not None and config.eos_token_id < vocab_size: + input_ids[input_ids == config.eos_token_id] = random.randint(0, config.eos_token_id - 1) + input_ids[:, -1] = config.eos_token_id + in_data = {"input_ids": input_ids} + + if config and config.is_encoder_decoder: + in_data["decoder_input_ids"] = input_ids.clone() + return in_data + + def add_head(self, model, name, **kwargs): + model.add_classification_head(name, **kwargs) + + def dataset(self, tokenizer): + data_args = GlueDataTrainingArguments( + task_name="mrpc", data_dir="./tests/fixtures/tests_samples/MRPC", overwrite_cache=True + ) + return GlueDataset(data_args, tokenizer=tokenizer, mode="train") diff --git a/tests/test_adapter_common.py b/tests_adapters/test_adapter_common.py similarity index 96% rename from tests/test_adapter_common.py rename to tests_adapters/test_adapter_common.py index caf95cd8b..0a33d9231 100644 --- a/tests/test_adapter_common.py +++ b/tests_adapters/test_adapter_common.py @@ -5,10 +5,10 @@ from transformers import ( ADAPTER_CONFIG_MAP, + ADAPTER_MODEL_MAPPING, MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, - MODEL_WITH_HEADS_MAPPING, AdapterSetup, - AutoModelWithHeads, + AutoAdapterModel, HoulsbyConfig, HoulsbyInvConfig, PfeifferConfig, @@ -259,12 +259,12 @@ def test_model_config_serialization(self): model.config.to_json_string() def test_loading_adapter_weights_with_prefix(self): - if self.config_class not in MODEL_WITH_HEADS_MAPPING: + if self.config_class not in ADAPTER_MODEL_MAPPING: self.skipTest("Does not support flex heads.") model_base, model_with_head_base = create_twin_models(self.model_class, self.config) - model_with_head = AutoModelWithHeads.from_config(model_with_head_base.config) + model_with_head = AutoAdapterModel.from_config(model_with_head_base.config) setattr(model_with_head, model_with_head.base_model_prefix, model_with_head_base) model_with_head.add_adapter("dummy") @@ -288,12 +288,12 @@ def test_loading_adapter_weights_with_prefix(self): self.assertTrue(torch.equal(output1[0], output2[0])) def test_loading_adapter_weights_without_prefix(self): - if self.config_class not in MODEL_WITH_HEADS_MAPPING: + if self.config_class not in ADAPTER_MODEL_MAPPING: self.skipTest("Does not support flex heads.") model_base, model_with_head_base = create_twin_models(self.model_class, self.config) - model_with_head = AutoModelWithHeads.from_config(model_with_head_base.config) + model_with_head = AutoAdapterModel.from_config(model_with_head_base.config) setattr(model_with_head, model_with_head.base_model_prefix, model_with_head_base) model_base.add_adapter("dummy") @@ -317,13 +317,13 @@ def test_loading_adapter_weights_without_prefix(self): self.assertTrue(torch.equal(output1[0], output2[0])) def test_forward_with_past(self): - if self.config_class not in MODEL_WITH_HEADS_MAPPING: + if self.config_class not in ADAPTER_MODEL_MAPPING: self.skipTest("Does not support flex heads.") if self.config_class not in MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING: self.skipTest("No causal lm class.") static_model = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING[self.config_class](self.config()) - flex_model = AutoModelWithHeads.from_pretrained( + flex_model = AutoAdapterModel.from_pretrained( None, config=self.config(), state_dict=static_model.state_dict() ) static_model.add_adapter("dummy") diff --git a/tests/test_adapter_composition.py b/tests_adapters/test_adapter_composition.py similarity index 95% rename from tests/test_adapter_composition.py rename to 
tests_adapters/test_adapter_composition.py index d51e67626..ca24bc321 100644 --- a/tests/test_adapter_composition.py +++ b/tests_adapters/test_adapter_composition.py @@ -4,21 +4,21 @@ import torch -from tests.test_adapter_training import filter_parameters +from tests.test_modeling_common import ids_tensor from transformers import ( MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, - AutoModelWithHeads, + AutoAdapterModel, AutoTokenizer, BertConfig, BertForSequenceClassification, - T5ModelWithHeads, + T5AdapterModel, Trainer, TrainingArguments, ) from transformers.adapters.composition import BatchSplit, Fuse, Parallel, Split, Stack, parse_composition from transformers.testing_utils import require_torch, torch_device -from .test_modeling_common import ids_tensor +from .test_adapter_training import filter_parameters class AdapterCompositionParsingTest(unittest.TestCase): @@ -146,7 +146,7 @@ def test_batch_split_equivalent(self): @require_torch class ParallelAdapterInferenceTestMixin: def test_parallel_inference_with_heads(self): - model = AutoModelWithHeads.from_config(self.config()) + model = AutoAdapterModel.from_config(self.config()) model.add_adapter("a") model.add_adapter("b") @@ -179,7 +179,7 @@ def test_parallel_inference_with_heads(self): self.assertTrue(torch.allclose(outputs[1][0], outputs_b[0], atol=1e-5)) def test_parallel_inference_with_wrong_number_of_heads(self): - model = AutoModelWithHeads.from_config(self.config()) + model = AutoAdapterModel.from_config(self.config()) model.eval() model.add_adapter("a") @@ -199,7 +199,7 @@ def test_parallel_inference_with_wrong_number_of_heads(self): model(**inputs) def test_batch_split_with_heads(self): - model = AutoModelWithHeads.from_config(self.config()) + model = AutoAdapterModel.from_config(self.config()) model.add_adapter("a") model.add_adapter("b") self.add_head(model, "a", num_labels=2) @@ -208,7 +208,7 @@ def test_batch_split_with_heads(self): model.to(torch_device) inputs = {"input_ids": self.get_input_samples((2, 128), config=model.config)["input_ids"]} - if isinstance(model, T5ModelWithHeads): + if isinstance(model, T5AdapterModel): inputs["decoder_input_ids"] = inputs["input_ids"] # for reference, pass through single adapters @@ -280,7 +280,7 @@ def test_parallel_training(self): tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=False) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token - model = AutoModelWithHeads.from_config(self.config()) + model = AutoAdapterModel.from_config(self.config()) model.add_adapter("mrpc1") model.add_adapter("mrpc2") @@ -322,7 +322,7 @@ def test_parallel_training(self): self.assertTrue(torch.equal(v1, v2)) def test_parallel_training_equivalent_to_single_adapters(self): - model = AutoModelWithHeads.from_config(self.config()) + model = AutoAdapterModel.from_config(self.config()) model.eval() a1, a2 = self.create_twin_adapters(model, "a") @@ -331,7 +331,7 @@ def test_parallel_training_equivalent_to_single_adapters(self): dataset = [] for i in range(3): input_data = self.get_input_samples((3, 128), config=model.config) - if isinstance(model, T5ModelWithHeads): + if isinstance(model, T5AdapterModel): input_data["labels"] = torch.randint(0, 2, (3, 128)) else: input_data["labels"] = torch.randint(0, 2, (3, 1)) @@ -362,7 +362,7 @@ def test_parallel_training_equivalent_to_single_adapters(self): self.assertTrue(torch.allclose(v, state_dict[k.replace(b1, b2)], atol=1e-5)) def test_parallel_training_single_forward_pass(self): - model = 
AutoModelWithHeads.from_config(self.config()) + model = AutoAdapterModel.from_config(self.config()) model.eval() a1, a2 = self.create_twin_adapters(model, "a") @@ -376,7 +376,7 @@ def test_parallel_training_single_forward_pass(self): self.assertTrue(torch.equal(v, state_dict[k.replace(b1, b2)])) input_data = self.get_input_samples((3, 128), config=model.config) - if isinstance(model, T5ModelWithHeads): + if isinstance(model, T5AdapterModel): input_data["labels"] = torch.randint(0, 2, (3, 128), device=torch_device) else: input_data["labels"] = torch.randint(0, 2, (3, 1), device=torch_device) diff --git a/tests/test_adapter_config.py b/tests_adapters/test_adapter_config.py similarity index 100% rename from tests/test_adapter_config.py rename to tests_adapters/test_adapter_config.py diff --git a/tests/test_adapter_conversion.py b/tests_adapters/test_adapter_conversion.py similarity index 98% rename from tests/test_adapter_conversion.py rename to tests_adapters/test_adapter_conversion.py index 7d37fffcb..f44a54fe3 100644 --- a/tests/test_adapter_conversion.py +++ b/tests_adapters/test_adapter_conversion.py @@ -12,7 +12,7 @@ MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, - AutoModelWithHeads, + AutoAdapterModel, BertPreTrainedModel, RobertaPreTrainedModel, ) @@ -26,7 +26,7 @@ class ModelClassConversionTestMixin: seq_length = 128 def run_test(self, static_model, input_shape=None, label_dict=None): - flex_model = AutoModelWithHeads.from_pretrained( + flex_model = AutoAdapterModel.from_pretrained( None, config=self.config(), state_dict=static_model.state_dict() ) static_model.eval() @@ -147,7 +147,7 @@ def test_equivalent_language_generation(self): self.skipTest("no causal lm class.") static_model = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING[self.config_class](self.config()) - flex_model = AutoModelWithHeads.from_pretrained( + flex_model = AutoAdapterModel.from_pretrained( None, config=self.config(), state_dict=static_model.state_dict() ) static_model.add_adapter("dummy") diff --git a/tests/test_adapter_custom_head.py b/tests_adapters/test_adapter_custom_head.py similarity index 88% rename from tests/test_adapter_custom_head.py rename to tests_adapters/test_adapter_custom_head.py index c3d9e703a..50f7a9b92 100644 --- a/tests/test_adapter_custom_head.py +++ b/tests_adapters/test_adapter_custom_head.py @@ -4,7 +4,7 @@ import torch from tests.test_modeling_common import ids_tensor -from transformers import AutoConfig, AutoModelWithHeads +from transformers import AutoConfig, AutoAdapterModel from transformers.adapters.heads import ClassificationHead, PredictionHead from transformers.testing_utils import require_torch, torch_device @@ -30,7 +30,7 @@ def forward(self, outputs, cls_output=None, attention_mask=None, return_dict=Fal class AdapterCustomHeadTest(unittest.TestCase): def test_add_custom_head(self): model_name = "bert-base-uncased" - model = AutoModelWithHeads.from_pretrained(model_name) + model = AutoAdapterModel.from_pretrained(model_name) model.register_custom_head("tag", CustomHead) config = {"num_labels": 3, "layers": 2, "activation_function": "tanh"} model.add_custom_head(head_type="tag", head_name="custom_head", **config) @@ -46,7 +46,7 @@ def test_add_custom_head(self): def test_custom_head_from_model_config(self): model_name = "bert-base-uncased" model_config = AutoConfig.from_pretrained(model_name, custom_heads={"tag": CustomHead}) - model = AutoModelWithHeads.from_pretrained(model_name, config=model_config) + 
model = AutoAdapterModel.from_pretrained(model_name, config=model_config) config = {"num_labels": 3, "layers": 2, "activation_function": "tanh"} model.add_custom_head(head_type="tag", head_name="custom_head", **config) model.eval() @@ -61,8 +61,8 @@ def test_custom_head_from_model_config(self): def test_save_load_custom_head(self): model_name = "bert-base-uncased" model_config = AutoConfig.from_pretrained(model_name, custom_heads={"tag": CustomHead}) - model1 = AutoModelWithHeads.from_pretrained(model_name, config=model_config) - model2 = AutoModelWithHeads.from_pretrained(model_name, config=model_config) + model1 = AutoAdapterModel.from_pretrained(model_name, config=model_config) + model2 = AutoAdapterModel.from_pretrained(model_name, config=model_config) config = {"num_labels": 3, "layers": 2, "activation_function": "tanh"} model1.add_custom_head(head_type="tag", head_name="custom_head", **config) @@ -87,7 +87,7 @@ def test_save_load_custom_head(self): def test_builtin_head_as_custom(self): model_name = "bert-base-uncased" model_config = AutoConfig.from_pretrained(model_name, custom_heads={"tag": CustomHead}) - model = AutoModelWithHeads.from_pretrained(model_name, config=model_config) + model = AutoAdapterModel.from_pretrained(model_name, config=model_config) model.eval() in_data = ids_tensor((1, 128), 1000) diff --git a/tests/test_adapter_embeddings.py b/tests_adapters/test_adapter_embeddings.py similarity index 86% rename from tests/test_adapter_embeddings.py rename to tests_adapters/test_adapter_embeddings.py index 568e561c4..c6ee78639 100644 --- a/tests/test_adapter_embeddings.py +++ b/tests_adapters/test_adapter_embeddings.py @@ -3,10 +3,11 @@ import torch -from tests.test_adapter_training import filter_parameters -from transformers import AutoModelWithHeads, AutoTokenizer, Trainer, TrainingArguments +from transformers import AutoAdapterModel, AutoTokenizer, Trainer, TrainingArguments from transformers.testing_utils import require_torch, torch_device +from .test_adapter_training import filter_parameters + @require_torch class EmbeddingTestMixin: @@ -20,13 +21,13 @@ def test_load_embeddings(self): def test_add_embeddings(self): model = self.get_model() - tokenizer = AutoTokenizer.from_pretrained("tests/fixtures/SiBERT") + tokenizer = AutoTokenizer.from_pretrained("tests_adapters/fixtures/SiBERT") model.add_embeddings("test", tokenizer) self.assertEqual(model.active_embeddings, "test") def test_delete_embeddings(self): model = self.get_model() - tokenizer = AutoTokenizer.from_pretrained("tests/fixtures/SiBERT") + tokenizer = AutoTokenizer.from_pretrained("tests_adapters/fixtures/SiBERT") model.add_embeddings("test", tokenizer) self.assertEqual(model.active_embeddings, "test") model.delete_embeddings("test") @@ -35,7 +36,7 @@ def test_delete_embeddings(self): def test_save_load_embedding(self): model = self.get_model() - tokenizer = AutoTokenizer.from_pretrained("tests/fixtures/SiBERT") + tokenizer = AutoTokenizer.from_pretrained("tests_adapters/fixtures/SiBERT") input_data = self.get_input_samples((1, 128), vocab_size=tokenizer.vocab_size, config=model.config) model.add_embeddings("test", tokenizer) model.eval() @@ -61,7 +62,7 @@ def test_back_to_default(self): model.eval() input_data = self.get_input_samples((1, 128), config=model.config) output1 = model(**input_data) - tokenizer = AutoTokenizer.from_pretrained("tests/fixtures/SiBERT") + tokenizer = AutoTokenizer.from_pretrained("tests_adapters/fixtures/SiBERT") model.add_embeddings("test", tokenizer) 
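For orientation, here is a minimal sketch of the embeddings API exercised by the EmbeddingTestMixin above. The checkpoint name is illustrative, and the calls (add_embeddings, active_embeddings, set_active_embeddings, delete_embeddings) mirror those in the test file, assuming a flex-head model loaded via AutoAdapterModel exposes them:

```python
from transformers import AutoAdapterModel, AutoTokenizer

model = AutoAdapterModel.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

model.add_embeddings("test", tokenizer)        # add a new, named input embedding matrix
assert model.active_embeddings == "test"       # newly added embeddings become the active ones
model.set_active_embeddings("default")         # switch back to the original embeddings
model.delete_embeddings("test")                # remove the extra embeddings again
```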
self.assertEqual(model.active_embeddings, "test") model.set_active_embeddings("default") @@ -70,11 +71,11 @@ def test_back_to_default(self): self.assertTrue(torch.equal(output1[0], output2[0])) def test_training_embedding(self): - model = AutoModelWithHeads.from_config(self.config()) + model = AutoAdapterModel.from_config(self.config()) tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=False) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token - tokenizer = AutoTokenizer.from_pretrained("tests/fixtures/SiBERT") + tokenizer = AutoTokenizer.from_pretrained("tests_adapters/fixtures/SiBERT") model.add_adapter("test") self.add_head(model, "test") model.train_adapter("test", train_embeddings=True) @@ -120,11 +121,11 @@ def test_training_embedding(self): ) def test_reference_embedding(self): - model = AutoModelWithHeads.from_config(self.config()) # self.get_model() + model = AutoAdapterModel.from_config(self.config()) # self.get_model() tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=False) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token - new_tokenizer = AutoTokenizer.from_pretrained("tests/fixtures/SiBERT") + new_tokenizer = AutoTokenizer.from_pretrained("tests_adapters/fixtures/SiBERT") model.add_embeddings("test", new_tokenizer, "default", tokenizer) diff --git a/tests/test_adapter_fusion_common.py b/tests_adapters/test_adapter_fusion_common.py similarity index 97% rename from tests/test_adapter_fusion_common.py rename to tests_adapters/test_adapter_fusion_common.py index 2d6aa3d74..28893a035 100644 --- a/tests/test_adapter_fusion_common.py +++ b/tests_adapters/test_adapter_fusion_common.py @@ -6,10 +6,10 @@ import torch from transformers import ( + ADAPTER_MODEL_MAPPING, ADAPTERFUSION_CONFIG_MAP, - MODEL_WITH_HEADS_MAPPING, AdapterConfig, - AutoModelWithHeads, + AutoAdapterModel, PfeifferConfig, ) from transformers.adapters.composition import Fuse @@ -163,9 +163,9 @@ def test_model_config_serialization_fusion(self): model.config.to_json_string() def test_adapter_fusion_save_with_head(self): - if self.config_class not in MODEL_WITH_HEADS_MAPPING: + if self.config_class not in ADAPTER_MODEL_MAPPING: self.skipTest("Does not support flex heads.") - model1 = AutoModelWithHeads.from_config(self.config()) + model1 = AutoAdapterModel.from_config(self.config()) model1.eval() name1 = "name1" diff --git a/tests/test_adapter_fusion_config.py b/tests_adapters/test_adapter_fusion_config.py similarity index 100% rename from tests/test_adapter_fusion_config.py rename to tests_adapters/test_adapter_fusion_config.py diff --git a/tests/test_adapter_heads.py b/tests_adapters/test_adapter_heads.py similarity index 83% rename from tests/test_adapter_heads.py rename to tests_adapters/test_adapter_heads.py index 3ed4f1485..ae1c34d1e 100644 --- a/tests/test_adapter_heads.py +++ b/tests_adapters/test_adapter_heads.py @@ -2,7 +2,7 @@ import torch -from transformers import MODEL_WITH_HEADS_MAPPING, AdapterSetup, AutoModelForSequenceClassification, AutoModelWithHeads +from transformers import ADAPTER_MODEL_MAPPING, AdapterSetup, AutoModelForSequenceClassification, AutoAdapterModel from transformers.adapters.composition import BatchSplit, Stack from transformers.testing_utils import require_torch, torch_device @@ -52,10 +52,10 @@ def run_prediction_head_test( self.assertTrue(torch.equal(output1[idx], output2[idx])) def test_classification_head(self): - if not hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], 
"add_classification_head"): + if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_classification_head"): self.skipTest("No classification head") - model1, model2 = create_twin_models(AutoModelWithHeads, self.config) + model1, model2 = create_twin_models(AutoAdapterModel, self.config) model1.add_classification_head("dummy") label_dict = {} @@ -63,10 +63,10 @@ def test_classification_head(self): self.run_prediction_head_test(model1, model2, "dummy", label_dict=label_dict) def test_multiple_choice_head(self): - if not hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_multiple_choice_head"): + if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_multiple_choice_head"): self.skipTest("No multiple choice head") - model1, model2 = create_twin_models(AutoModelWithHeads, self.config) + model1, model2 = create_twin_models(AutoAdapterModel, self.config) model1.add_multiple_choice_head("dummy") label_dict = {} @@ -76,10 +76,10 @@ def test_multiple_choice_head(self): ) def test_tagging_head(self): - if not hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_tagging_head"): + if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_tagging_head"): self.skipTest("No tagging head") - model1, model2 = create_twin_models(AutoModelWithHeads, self.config) + model1, model2 = create_twin_models(AutoAdapterModel, self.config) model1.add_tagging_head("dummy") label_dict = {} @@ -89,10 +89,10 @@ def test_tagging_head(self): ) def test_qa_head(self): - if not hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_qa_head"): + if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_qa_head"): self.skipTest("No QA head") - model1, model2 = create_twin_models(AutoModelWithHeads, self.config) + model1, model2 = create_twin_models(AutoAdapterModel, self.config) model1.add_qa_head("dummy") label_dict = {} @@ -103,10 +103,10 @@ def test_qa_head(self): ) def test_causal_lm_head(self): - if not hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_causal_lm_head"): + if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_causal_lm_head"): self.skipTest("No causal language model head") - model1, model2 = create_twin_models(AutoModelWithHeads, self.config) + model1, model2 = create_twin_models(AutoAdapterModel, self.config) model1.add_causal_lm_head("dummy") label_dict = {} @@ -121,10 +121,10 @@ def test_causal_lm_head(self): ) def test_seq2seq_lm_head(self): - if not hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_seq2seq_lm_head"): + if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_seq2seq_lm_head"): self.skipTest("No seq2seq language model head") - model1, model2 = create_twin_models(AutoModelWithHeads, self.config) + model1, model2 = create_twin_models(AutoAdapterModel, self.config) model1.add_seq2seq_lm_head("dummy") label_dict = {} @@ -152,10 +152,10 @@ def test_seq2seq_lm_head(self): self.assertEqual(generated.shape, (1, seq_output_length)) def test_masked_lm_head(self): - if not hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_masked_lm_head"): + if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_masked_lm_head"): self.skipTest("No causal or seq2seq language model head") - model1, model2 = create_twin_models(AutoModelWithHeads, self.config) + model1, model2 = create_twin_models(AutoAdapterModel, self.config) model1.add_masked_lm_head("dummy") label_dict = {} @@ -169,10 +169,10 @@ def test_masked_lm_head(self): ) def test_dependency_parsing_head(self): - if not hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], 
"add_dependency_parsing_head"): + if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_dependency_parsing_head"): self.skipTest("No dependency parsing head") - model1, model2 = create_twin_models(AutoModelWithHeads, self.config) + model1, model2 = create_twin_models(AutoAdapterModel, self.config) model1.add_dependency_parsing_head("dummy") label_dict = {} @@ -190,7 +190,7 @@ def test_dependency_parsing_head(self): ) def test_delete_head(self): - model = AutoModelWithHeads.from_config(self.config()) + model = AutoAdapterModel.from_config(self.config()) model.eval() name = "test_head" @@ -205,9 +205,9 @@ def test_delete_head(self): self.assertNotEqual(name, model.active_head) def test_adapter_with_head(self): - if not hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_classification_head"): + if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_classification_head"): self.skipTest("No classification head available") - model1, model2 = create_twin_models(AutoModelWithHeads, self.config) + model1, model2 = create_twin_models(AutoAdapterModel, self.config) name = "dummy" model1.add_adapter(name) @@ -229,9 +229,9 @@ def test_adapter_with_head(self): self.assertEqual(3, output1[0].size()[1]) def test_adapter_with_head_load_as(self): - if not hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_classification_head"): + if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_classification_head"): self.skipTest("No classification head available") - model1, model2 = create_twin_models(AutoModelWithHeads, self.config) + model1, model2 = create_twin_models(AutoAdapterModel, self.config) name = "dummy" model1.add_adapter(name) @@ -255,7 +255,7 @@ def test_adapter_with_head_load_as(self): self.assertEqual(3, output1[0].size()[1]) def test_load_full_model(self): - model = AutoModelWithHeads.from_config(self.config()) + model = AutoAdapterModel.from_config(self.config()) model.add_classification_head("dummy", layers=1) true_config = model.get_prediction_heads_config() @@ -263,14 +263,14 @@ def test_load_full_model(self): # save model.save_pretrained(temp_dir) # reload - model = AutoModelWithHeads.from_pretrained(temp_dir) + model = AutoAdapterModel.from_pretrained(temp_dir) self.assertIn("dummy", model.heads) self.assertDictEqual(true_config, model.get_prediction_heads_config()) def test_batch_split_head(self): - if not hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_classification_head"): + if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_classification_head"): self.skipTest("No classification head available") - model = AutoModelWithHeads.from_config(self.config()) + model = AutoAdapterModel.from_config(self.config()) model.add_classification_head("a") model.add_classification_head("b") model.active_head = BatchSplit("a", "b", batch_sizes=[1, 2]) @@ -284,7 +284,7 @@ def test_batch_split_head(self): self.assertEqual((2, 2), out[1][0].shape) def test_batch_split_adapter_head(self): - model = AutoModelWithHeads.from_config(self.config()) + model = AutoAdapterModel.from_config(self.config()) self.add_head(model, "a") self.add_head(model, "b") model.add_adapter("a") @@ -300,10 +300,10 @@ def test_batch_split_adapter_head(self): self.assertTrue(isinstance(model.active_head, BatchSplit)) def test_reload_static_to_flex_head(self): - if not hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_classification_head"): + if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_classification_head"): self.skipTest("No classification head available") 
static_head_model = AutoModelForSequenceClassification.from_config(self.config()) - flex_head_model = AutoModelWithHeads.from_pretrained( + flex_head_model = AutoAdapterModel.from_pretrained( None, config=self.config(), state_dict=static_head_model.state_dict() ) static_head_model.eval() @@ -336,16 +336,16 @@ def test_reload_static_to_flex_head(self): self.assertTrue(torch.all(torch.isclose(output1.logits, output2.logits))) def test_invertible_adapter_with_head(self): - if hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_masked_lm_head"): + if hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_masked_lm_head"): lm_head = "masked_lm" - elif hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_causal_lm_head"): + elif hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_causal_lm_head"): lm_head = "casual_lm" - elif hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_seq2seq_lm_head"): + elif hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_seq2seq_lm_head"): lm_head = "seq2seq_lm" else: self.skipTest("No masked or causel language model head") - model = AutoModelWithHeads.from_config(self.config()) + model = AutoAdapterModel.from_config(self.config()) model.add_adapter("test", config="pfeiffer+inv") if lm_head == "casual_lm": model.add_causal_lm_head("test") @@ -377,9 +377,9 @@ def forward_pre_hook(module, input): self.assertEqual(2, calls) def test_context_simple(self): - if not hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_classification_head"): + if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_classification_head"): self.skipTest("No classification head available") - model = AutoModelWithHeads.from_config(self.config()) + model = AutoAdapterModel.from_config(self.config()) model.add_adapter("a") model.add_classification_head("a", num_labels=3) # Make sure no adapter is activated diff --git a/tests/test_adapter_hub.py b/tests_adapters/test_adapter_hub.py similarity index 98% rename from tests/test_adapter_hub.py rename to tests_adapters/test_adapter_hub.py index bd02cd8b9..150198df0 100644 --- a/tests/test_adapter_hub.py +++ b/tests_adapters/test_adapter_hub.py @@ -3,13 +3,14 @@ import numpy as np +from tests.test_modeling_common import ids_tensor from transformers import ( # get_adapter_config_hash, ADAPTER_CONFIG_MAP, AdapterConfig, AutoModel, AutoTokenizer, + BertAdapterModel, BertForSequenceClassification, - BertModelWithHeads, GlueDataset, GlueDataTrainingArguments, TrainingArguments, @@ -20,8 +21,6 @@ from transformers.adapters.utils import find_in_index from transformers.testing_utils import require_torch, torch_device -from .test_modeling_common import ids_tensor - SAMPLE_INDEX = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/hub-index.sample.json") @@ -140,7 +139,7 @@ def test_load_lang_adapter_from_hub(self): self.assertEqual([1, 128, 768], list(output[0].size())) def test_load_adapter_with_head_from_hub(self): - model = BertModelWithHeads.from_pretrained("bert-base-uncased") + model = BertAdapterModel.from_pretrained("bert-base-uncased") loading_info = {} adapter_name = model.load_adapter( diff --git a/tests/test_adapter_save_id2label.py b/tests_adapters/test_adapter_save_id2label.py similarity index 94% rename from tests/test_adapter_save_id2label.py rename to tests_adapters/test_adapter_save_id2label.py index 95b94bf37..5807b93c2 100644 --- a/tests/test_adapter_save_id2label.py +++ b/tests_adapters/test_adapter_save_id2label.py @@ -2,7 +2,7 @@ from tempfile import TemporaryDirectory from typing 
import Dict -from transformers import BertConfig, BertForSequenceClassification, BertModelWithHeads +from transformers import BertAdapterModel, BertConfig, BertForSequenceClassification def get_default(num_label): @@ -62,7 +62,7 @@ def test_sequ_classification_model_head_labels(self): self.assertDictEqual(self.label_map, model.get_labels_dict()) def test_model_with_heads_tagging_head_labels(self): - model = BertModelWithHeads(self.config) + model = BertAdapterModel(self.config) model.add_tagging_head("test_head", num_labels=len(self.labels), id2label=self.label_map) with TemporaryDirectory() as temp_dir: model.save_head(temp_dir, "test_head") @@ -74,7 +74,7 @@ def test_model_with_heads_tagging_head_labels(self): self.assertDictEqual(self.label_map, model.get_labels_dict()) def test_multiple_heads_label(self): - model = BertModelWithHeads(self.config) + model = BertAdapterModel(self.config) model.add_tagging_head("test_head", num_labels=len(self.labels), id2label=self.label_map) with TemporaryDirectory() as temp_dir: model.save_head(temp_dir, "test_head") @@ -88,7 +88,7 @@ def test_multiple_heads_label(self): self.assertEqual(model.get_labels_dict("classification_head"), default_label_dict) def test_model_with_heads_multiple_heads(self): - model = BertModelWithHeads(self.config) + model = BertAdapterModel(self.config) model.add_tagging_head("test_head", num_labels=len(self.labels), id2label=self.label_map) model.add_classification_head("second_head", num_labels=5) with TemporaryDirectory() as temp_dir: diff --git a/tests/test_adapter_setup_context.py b/tests_adapters/test_adapter_setup_context.py similarity index 94% rename from tests/test_adapter_setup_context.py rename to tests_adapters/test_adapter_setup_context.py index f9a00ff7a..6d96535bd 100644 --- a/tests/test_adapter_setup_context.py +++ b/tests_adapters/test_adapter_setup_context.py @@ -2,7 +2,7 @@ from threading import Thread from tests.test_modeling_common import ids_tensor -from transformers import AdapterSetup, AutoModelWithHeads, BertConfig +from transformers import AdapterSetup, AutoAdapterModel, BertConfig from transformers.testing_utils import require_torch, torch_device @@ -17,7 +17,7 @@ def setUp(self): ) def test_context_nested(self): - model = AutoModelWithHeads.from_config(self.config) + model = AutoAdapterModel.from_config(self.config) model.add_adapter("a") model.add_classification_head("a", num_labels=2) model.add_adapter("b") @@ -57,7 +57,7 @@ def forward_pre_hook_b(module, input): self.assertEqual(calls_b, 1) def test_context_multi_threading(self): - model = AutoModelWithHeads.from_config(self.config) + model = AutoAdapterModel.from_config(self.config) model.add_adapter("a") model.add_classification_head("a", num_labels=2) model.add_adapter("b") diff --git a/tests/test_adapter_trainer.py b/tests_adapters/test_adapter_trainer.py similarity index 98% rename from tests/test_adapter_trainer.py rename to tests_adapters/test_adapter_trainer.py index 613522163..8f2e1b6f6 100644 --- a/tests/test_adapter_trainer.py +++ b/tests_adapters/test_adapter_trainer.py @@ -6,7 +6,7 @@ from transformers import ( AutoModelForSequenceClassification, - AutoModelWithHeads, + AutoAdapterModel, AutoTokenizer, BertConfig, BertForSequenceClassification, @@ -221,7 +221,7 @@ def test_reloading_prediction_head(self): ) train_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="train") - model = AutoModelWithHeads.from_pretrained("bert-base-uncased") + model = AutoAdapterModel.from_pretrained("bert-base-uncased") 
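To make the label-map handling in test_adapter_save_id2label.py above concrete, the following sketch uses an illustrative tiny config and label map; the calls (add_tagging_head with id2label, get_labels_dict, save_head) are the ones exercised by those tests.

```python
from tempfile import TemporaryDirectory

from transformers import BertAdapterModel, BertConfig

label_map = {0: "O", 1: "B-ENT", 2: "I-ENT"}  # illustrative id2label mapping
config = BertConfig(hidden_size=32, num_hidden_layers=4, num_attention_heads=4, intermediate_size=37)

model = BertAdapterModel(config)
model.add_tagging_head("test_head", num_labels=len(label_map), id2label=label_map)
assert model.get_labels_dict("test_head") == label_map

with TemporaryDirectory() as temp_dir:
    model.save_head(temp_dir, "test_head")  # head config, including the label map, is saved alongside the weights
```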
model.add_classification_head("adapter", num_labels=3) model.add_classification_head("dummy", num_labels=2) @@ -255,7 +255,7 @@ def test_reloading_prediction_head(self): trainer.train() # create second model that should resume the training of the first - model_resume = AutoModelWithHeads.from_pretrained("bert-base-uncased") + model_resume = AutoAdapterModel.from_pretrained("bert-base-uncased") model_resume.add_classification_head("adapter", num_labels=3) model_resume.add_classification_head("dummy", num_labels=2) @@ -290,7 +290,7 @@ def test_general(self): ) train_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="train") - model = AutoModelWithHeads.from_pretrained("bert-base-uncased") + model = AutoAdapterModel.from_pretrained("bert-base-uncased") model.add_classification_head("task", num_labels=3) diff --git a/tests/test_adapter_training.py b/tests_adapters/test_adapter_training.py similarity index 96% rename from tests/test_adapter_training.py rename to tests_adapters/test_adapter_training.py index 4957184ee..66e6db441 100644 --- a/tests/test_adapter_training.py +++ b/tests_adapters/test_adapter_training.py @@ -2,7 +2,7 @@ import torch -from transformers import AutoModelWithHeads, AutoTokenizer, TrainingArguments +from transformers import AutoAdapterModel, AutoTokenizer, TrainingArguments from transformers.adapters.composition import BatchSplit, Fuse from transformers.adapters.trainer import AdapterTrainer as Trainer from transformers.testing_utils import require_torch @@ -38,7 +38,7 @@ def test_train_single_adapter(self): tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=False) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token - model = AutoModelWithHeads.from_config(self.config()) + model = AutoAdapterModel.from_config(self.config()) # add two adapters: one will be trained and the other should be frozen model.add_adapter("mrpc") @@ -76,7 +76,7 @@ def test_train_adapter_fusion(self): tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=False) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token - model = AutoModelWithHeads.from_config(self.config()) + model = AutoAdapterModel.from_config(self.config()) self.add_head(model, "head") # add the adapters to be fused @@ -138,7 +138,7 @@ def test_batch_split_training(self): tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=False) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token - model = AutoModelWithHeads.from_config(self.config()) + model = AutoAdapterModel.from_config(self.config()) model.add_adapter("mrpc1") model.add_adapter("mrpc2") diff --git a/tests_adapters/test_bart.py b/tests_adapters/test_bart.py new file mode 100644 index 000000000..9990b914e --- /dev/null +++ b/tests_adapters/test_bart.py @@ -0,0 +1,61 @@ +import unittest + +from tests.test_modeling_bart import * +from transformers import BartAdapterModel +from transformers.testing_utils import require_torch + +from .test_adapter import AdapterTestBase, make_config +from .test_adapter_common import AdapterModelTestMixin +from .test_adapter_composition import ParallelAdapterInferenceTestMixin, ParallelTrainingMixin +from .test_adapter_conversion import ModelClassConversionTestMixin +from .test_adapter_embeddings import EmbeddingTestMixin +from .test_adapter_fusion_common import AdapterFusionModelTestMixin +from .test_adapter_heads import PredictionHeadModelTestMixin +from .test_adapter_training import AdapterTrainingTestMixin +from 
.test_common import AdapterModelTesterMixin + + +@require_torch +class BartAdapterModelTest(AdapterModelTesterMixin, BartModelTest): + all_model_classes = ( + BartAdapterModel, + ) + + +class BartAdapterTestBase(AdapterTestBase): + config_class = BartConfig + config = make_config( + BartConfig, + d_model=16, + encoder_layers=2, + decoder_layers=2, + encoder_attention_heads=4, + decoder_attention_heads=4, + encoder_ffn_dim=4, + decoder_ffn_dim=4, + ) + tokenizer_name = "facebook/bart-base" + + +@require_torch +class BartAdapterTest( + AdapterModelTestMixin, + AdapterFusionModelTestMixin, + EmbeddingTestMixin, + PredictionHeadModelTestMixin, + AdapterTrainingTestMixin, + ParallelAdapterInferenceTestMixin, + ParallelTrainingMixin, + BartAdapterTestBase, + unittest.TestCase, +): + pass + + +@require_torch +class BartClassConversionTest( + ModelClassConversionTestMixin, + BartAdapterTestBase, + unittest.TestCase, +): + pass diff --git a/tests_adapters/test_bert.py b/tests_adapters/test_bert.py new file mode 100644 index 000000000..57d0d54ac --- /dev/null +++ b/tests_adapters/test_bert.py @@ -0,0 +1,58 @@ +import unittest + +from tests.test_modeling_bert import * +from transformers import BertAdapterModel +from transformers.testing_utils import require_torch + +from .test_adapter import AdapterTestBase, make_config +from .test_adapter_common import AdapterModelTestMixin +from .test_adapter_composition import ParallelAdapterInferenceTestMixin, ParallelTrainingMixin +from .test_adapter_conversion import ModelClassConversionTestMixin +from .test_adapter_embeddings import EmbeddingTestMixin +from .test_adapter_fusion_common import AdapterFusionModelTestMixin +from .test_adapter_heads import PredictionHeadModelTestMixin +from .test_adapter_training import AdapterTrainingTestMixin +from .test_common import AdapterModelTesterMixin + + +@require_torch +class BertAdapterModelTest(AdapterModelTesterMixin, BertModelTest): + all_model_classes = ( + BertAdapterModel, + ) + + +class BertAdapterTestBase(AdapterTestBase): + config_class = BertConfig + config = make_config( + BertConfig, + hidden_size=32, + num_hidden_layers=4, + num_attention_heads=4, + intermediate_size=37, + ) + tokenizer_name = "bert-base-uncased" + + +@require_torch +class BertAdapterTest( + EmbeddingTestMixin, + AdapterModelTestMixin, + AdapterFusionModelTestMixin, + PredictionHeadModelTestMixin, + AdapterTrainingTestMixin, + ParallelAdapterInferenceTestMixin, + ParallelTrainingMixin, + BertAdapterTestBase, + unittest.TestCase, +): + pass + + +@require_torch +class BertClassConversionTest( + ModelClassConversionTestMixin, + BertAdapterTestBase, + unittest.TestCase, +): + pass diff --git a/tests_adapters/test_common.py b/tests_adapters/test_common.py new file mode 100644 index 000000000..272f17609 --- /dev/null +++ b/tests_adapters/test_common.py @@ -0,0 +1,13 @@ +from transformers.testing_utils import require_torch + + +@require_torch +class AdapterModelTesterMixin: + def test_training(self): + self.skipTest("Not applicable.") + + def test_training_gradient_checkpointing(self): + self.skipTest("Not applicable.") + + def test_correct_missing_keys(self): + self.skipTest("Not applicable.") diff --git a/tests_adapters/test_distilbert.py b/tests_adapters/test_distilbert.py new file mode 100644 index 000000000..bdba3dbfd --- /dev/null +++ b/tests_adapters/test_distilbert.py @@ -0,0 +1,58 @@ +import unittest + +from tests.test_modeling_distilbert import * +from transformers import DistilBertAdapterModel +from transformers.testing_utils import 
require_torch + +from .test_adapter import AdapterTestBase, make_config +from .test_adapter_common import AdapterModelTestMixin +from .test_adapter_composition import ParallelAdapterInferenceTestMixin, ParallelTrainingMixin +from .test_adapter_conversion import ModelClassConversionTestMixin +from .test_adapter_embeddings import EmbeddingTestMixin +from .test_adapter_fusion_common import AdapterFusionModelTestMixin +from .test_adapter_heads import PredictionHeadModelTestMixin +from .test_adapter_training import AdapterTrainingTestMixin +from .test_common import AdapterModelTesterMixin + + +@require_torch +class DistilBertAdapterModelTest(AdapterModelTesterMixin, DistilBertModelTest): + all_model_classes = ( + DistilBertAdapterModel, + ) + + +class DistilBertAdapterTestBase(AdapterTestBase): + config_class = DistilBertConfig + config = make_config( + DistilBertConfig, + dim=32, + n_layers=4, + n_heads=4, + hidden_dim=37, + ) + tokenizer_name = "distilbert-base-uncased" + + +@require_torch +class DistilBertAdapterTest( + AdapterModelTestMixin, + EmbeddingTestMixin, + AdapterFusionModelTestMixin, + PredictionHeadModelTestMixin, + AdapterTrainingTestMixin, + ParallelAdapterInferenceTestMixin, + ParallelTrainingMixin, + DistilBertAdapterTestBase, + unittest.TestCase, +): + pass + + +@require_torch +class DistilBertClassConversionTest( + ModelClassConversionTestMixin, + DistilBertAdapterTestBase, + unittest.TestCase, +): + pass diff --git a/tests_adapters/test_encoder_decoder.py b/tests_adapters/test_encoder_decoder.py new file mode 100644 index 000000000..55532003b --- /dev/null +++ b/tests_adapters/test_encoder_decoder.py @@ -0,0 +1,65 @@ +import unittest + +from tests.test_modeling_encoder_decoder import * # Imported to execute model tests +from transformers import AutoModelForSeq2SeqLM, BertConfig + +from .test_adapter import AdapterTestBase +from .test_adapter_common import AdapterModelTestMixin +from .test_adapter_fusion_common import AdapterFusionModelTestMixin + + +class EncoderDecoderAdapterTestBase(AdapterTestBase): + model_class = EncoderDecoderModel + config_class = EncoderDecoderConfig + config = staticmethod( + lambda: EncoderDecoderConfig.from_encoder_decoder_configs( + BertConfig( + hidden_size=32, + num_hidden_layers=4, + num_attention_heads=4, + intermediate_size=37, + ), + BertConfig( + hidden_size=32, + num_hidden_layers=4, + num_attention_heads=4, + intermediate_size=37, + is_decoder=True, + add_cross_attention=True, + ), + ) + ) + tokenizer_name = "bert-base-uncased" + + +@require_torch +class EncoderDecoderAdapterTest( + AdapterModelTestMixin, + AdapterFusionModelTestMixin, + EncoderDecoderAdapterTestBase, + unittest.TestCase, +): + def test_invertible_adapter_with_head(self): + """This test class is copied and adapted from the identically-named test in test_adapter_heads.py.""" + model = AutoModelForSeq2SeqLM.from_config(self.config()) + model.add_adapter("test", config="pfeiffer+inv") + model.set_active_adapters("test") + + # Set a hook before the invertible adapter to make sure it's actually called twice: + # Once after the embedding layer and once in the prediction head. 
+ calls = 0 + + def forward_pre_hook(module, input): + nonlocal calls + calls += 1 + + inv_adapter = model.base_model.get_invertible_adapter() + self.assertIsNotNone(inv_adapter) + inv_adapter.register_forward_pre_hook(forward_pre_hook) + + in_data = self.get_input_samples((1, 128), config=model.config) + model.to(torch_device) + out = model(**in_data) + + self.assertEqual((1, 128, model.config.decoder.vocab_size), out[0].shape) + self.assertEqual(2, calls) diff --git a/tests_adapters/test_gpt2.py b/tests_adapters/test_gpt2.py new file mode 100644 index 000000000..eac05aed0 --- /dev/null +++ b/tests_adapters/test_gpt2.py @@ -0,0 +1,59 @@ +import unittest + +from tests.test_modeling_gpt2 import * +from transformers import GPT2AdapterModel +from transformers.testing_utils import require_torch + +from .test_adapter import AdapterTestBase, make_config +from .test_adapter_common import AdapterModelTestMixin +from .test_adapter_composition import ParallelAdapterInferenceTestMixin, ParallelTrainingMixin +from .test_adapter_conversion import ModelClassConversionTestMixin +from .test_adapter_embeddings import EmbeddingTestMixin +from .test_adapter_fusion_common import AdapterFusionModelTestMixin +from .test_adapter_heads import PredictionHeadModelTestMixin +from .test_adapter_training import AdapterTrainingTestMixin +from .test_common import AdapterModelTesterMixin + + +@require_torch +class GPT2AdapterModelTest(AdapterModelTesterMixin, GPT2ModelTest): + all_model_classes = ( + GPT2AdapterModel, + ) + + +class GPT2AdapterTestBase(AdapterTestBase): + config_class = GPT2Config + config = make_config( + GPT2Config, + n_embd=32, + n_layer=4, + n_head=4, + # set pad token to eos token + pad_token_id=50256, + ) + tokenizer_name = "gpt2" + + +@require_torch +class GPT2AdapterTest( + AdapterModelTestMixin, + EmbeddingTestMixin, + AdapterFusionModelTestMixin, + PredictionHeadModelTestMixin, + AdapterTrainingTestMixin, + ParallelAdapterInferenceTestMixin, + ParallelTrainingMixin, + GPT2AdapterTestBase, + unittest.TestCase, +): + pass + + +@require_torch +class GPT2ClassConversionTest( + ModelClassConversionTestMixin, + GPT2AdapterTestBase, + unittest.TestCase, +): + pass diff --git a/tests_adapters/test_mbart.py b/tests_adapters/test_mbart.py new file mode 100644 index 000000000..416c5b1f4 --- /dev/null +++ b/tests_adapters/test_mbart.py @@ -0,0 +1,55 @@ +import unittest + +from tests.test_modeling_mbart import * +from transformers import MBartAdapterModel +from transformers.testing_utils import require_torch + +from .test_adapter import AdapterTestBase, make_config +from .test_adapter_common import AdapterModelTestMixin +from .test_adapter_composition import ParallelAdapterInferenceTestMixin +from .test_adapter_conversion import ModelClassConversionTestMixin +from .test_adapter_fusion_common import AdapterFusionModelTestMixin +from .test_adapter_heads import PredictionHeadModelTestMixin +from .test_common import AdapterModelTesterMixin + + +@require_torch +class MBartAdapterModelTest(AdapterModelTesterMixin, MBartModelTest): + all_model_classes = ( + MBartAdapterModel, + ) + + +class MBartAdapterTestBase(AdapterTestBase): + config_class = MBartConfig + config = make_config( + MBartConfig, + d_model=16, + encoder_layers=2, + decoder_layers=2, + encoder_attention_heads=4, + decoder_attention_heads=4, + encoder_ffn_dim=4, + decoder_ffn_dim=4, + ) + + +@require_torch +class MBartAdapterTest( + AdapterModelTestMixin, + AdapterFusionModelTestMixin, + PredictionHeadModelTestMixin, + 
ParallelAdapterInferenceTestMixin, + MBartAdapterTestBase, + unittest.TestCase, +): + pass + + +@require_torch +class MBartClassConversionTest( + ModelClassConversionTestMixin, + MBartAdapterTestBase, + unittest.TestCase, +): + pass diff --git a/tests_adapters/test_roberta.py b/tests_adapters/test_roberta.py new file mode 100644 index 000000000..bb4af5be6 --- /dev/null +++ b/tests_adapters/test_roberta.py @@ -0,0 +1,52 @@ +import unittest + +from tests.test_modeling_roberta import * +from transformers import RobertaAdapterModel +from transformers.testing_utils import require_torch + +from .test_adapter import AdapterTestBase, make_config +from .test_adapter_common import AdapterModelTestMixin +from .test_adapter_composition import ParallelAdapterInferenceTestMixin +from .test_adapter_conversion import ModelClassConversionTestMixin +from .test_adapter_fusion_common import AdapterFusionModelTestMixin +from .test_adapter_heads import PredictionHeadModelTestMixin +from .test_common import AdapterModelTesterMixin + + +@require_torch +class RobertaAdapterModelTest(AdapterModelTesterMixin, RobertaModelTest): + all_model_classes = ( + RobertaAdapterModel, + ) + + +class RobertaAdapterTestBase(AdapterTestBase): + config_class = RobertaConfig + config = make_config( + RobertaConfig, + hidden_size=32, + num_hidden_layers=4, + num_attention_heads=4, + intermediate_size=37, + ) + + +@require_torch +class RobertaAdapterTest( + AdapterModelTestMixin, + AdapterFusionModelTestMixin, + PredictionHeadModelTestMixin, + ParallelAdapterInferenceTestMixin, + RobertaAdapterTestBase, + unittest.TestCase, +): + pass + + +@require_torch +class RobertaClassConversionTest( + ModelClassConversionTestMixin, + RobertaAdapterTestBase, + unittest.TestCase, +): + pass diff --git a/tests_adapters/test_t5.py b/tests_adapters/test_t5.py new file mode 100644 index 000000000..41067c6cc --- /dev/null +++ b/tests_adapters/test_t5.py @@ -0,0 +1,102 @@ +import unittest + +from datasets import load_dataset + +from tests.test_modeling_t5 import * +from transformers import T5AdapterModel +from transformers.testing_utils import require_torch + +from .test_adapter import AdapterTestBase, make_config +from .test_adapter_common import AdapterModelTestMixin +from .test_adapter_composition import ParallelAdapterInferenceTestMixin, ParallelTrainingMixin +from .test_adapter_conversion import ModelClassConversionTestMixin +from .test_adapter_embeddings import EmbeddingTestMixin +from .test_adapter_fusion_common import AdapterFusionModelTestMixin +from .test_adapter_heads import PredictionHeadModelTestMixin +from .test_adapter_training import AdapterTrainingTestMixin +from .test_common import AdapterModelTesterMixin + + +@require_torch +class T5AdapterModelTest(AdapterModelTesterMixin, T5ModelTest): + all_model_classes = ( + T5AdapterModel, + ) + + +@require_torch +class T5AdapterTestBase(AdapterTestBase): + config_class = T5Config + config = make_config( + T5Config, + d_model=16, + num_layers=2, + num_decoder_layers=2, + num_heads=4, + d_ff=4, + d_kv=16 // 4, + tie_word_embeddings=False, + decoder_start_token_id=0, + ) + tokenizer_name = "t5-base" + + def add_head(self, model, name, **kwargs): + model.add_seq2seq_lm_head(name) + + def dataset(self, tokenizer): + def preprocess_function(examples): + inputs = examples["document"] + targets = examples["summary"] + inputs = ["Summarize: " + inp for inp in inputs] + model_inputs = tokenizer(inputs, padding=True, truncation=True) + + # Setup the tokenizer for targets + with 
tokenizer.as_target_tokenizer(): + labels = tokenizer(targets, padding=True, truncation=True) + + # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore + # padding in the loss. + labels["input_ids"] = [ + [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] + ] + + model_inputs["labels"] = labels["input_ids"] + return model_inputs + + data_args = { + "task_name": "xsum", + "path": "./tests/fixtures/tests_samples/xsum/sample.json", + } + dataset = load_dataset("json", data_files=data_args["path"]) + train_dataset = dataset["train"] + train_dataset = train_dataset.map( + preprocess_function, + batched=True, + desc="Running tokenizer on train dataset", + ) + return train_dataset + + +@require_torch +class T5AdapterTest( + T5AdapterTestBase, + EmbeddingTestMixin, + ParallelAdapterInferenceTestMixin, + ParallelTrainingMixin, + AdapterModelTestMixin, + AdapterFusionModelTestMixin, + AdapterTrainingTestMixin, + PredictionHeadModelTestMixin, + AdapterTestBase, + unittest.TestCase, +): + pass + + +@require_torch +class T5ClassConversionTest( + ModelClassConversionTestMixin, + T5AdapterTestBase, + unittest.TestCase, +): + pass diff --git a/tests_adapters/test_xlm_roberta.py b/tests_adapters/test_xlm_roberta.py new file mode 100644 index 000000000..b141814df --- /dev/null +++ b/tests_adapters/test_xlm_roberta.py @@ -0,0 +1,24 @@ +import unittest + +from transformers import XLMRobertaConfig +from transformers.testing_utils import require_torch + +from .test_adapter import AdapterTestBase, make_config +from .test_adapter_conversion import ModelClassConversionTestMixin + + +@require_torch +class XLMRobertaClassConversionTest( + ModelClassConversionTestMixin, + AdapterTestBase, + unittest.TestCase, +): + config_class = XLMRobertaConfig + config = make_config( + XLMRobertaConfig, + hidden_size=32, + num_hidden_layers=4, + num_attention_heads=4, + intermediate_size=37, + ) + diff --git a/utils/run_tests.py b/utils/run_tests.py deleted file mode 100644 index 24722e9d9..000000000 --- a/utils/run_tests.py +++ /dev/null @@ -1,37 +0,0 @@ -""" -Runs adapter tests and a subset of other tests relevant for adapter-transformers. -""" -import pytest - - -TESTED_MODULES = [ - "test_adapter", - "test_modeling_auto", - "test_modeling_bart", - "test_modeling_bert", - "test_modeling_distilbert", - "test_modeling_gpt2", - "test_modeling_mbart", - "test_modeling_roberta", - "test_modeling_xlm_roberta", - "test_modeling_encoder_decoder", - "test_modeling_t5", - "test_trainer", -] - - -if __name__ == "__main__": - test_selection = " or ".join(TESTED_MODULES) - args = [ - "-k", - test_selection, - "--numprocesses=auto", - "--dist=loadfile", - "-s", - "-v", - "--ignore-glob=tests/test_tokenization*", - "--ignore-glob=tests/test_processor*", - "./tests", - ] - exit_code = pytest.main(args) - exit(exit_code)